SAX解析XML文件
基本使用方法
import java.io.IOException;
import java.util.ArrayList;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import com.garlick.xml.decode.Decode;
/**
 * Decodes the company XML file with a SAX parser and prints every group
 * together with its leaders and staff members.
 */
public class SaxXmlDecode extends Decode {

    /**
     * Parses {@code COMPANY_FILE_NAME} with a SAX parser. The collected groups
     * are printed by the handler once the end of the document is reached.
     * Parser/IO failures are reported via {@code printStackTrace()} and not rethrown.
     */
    public void decode() {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        try {
            factory.newSAXParser().parse(COMPANY_FILE_NAME, new MyHandler());
        } catch (IOException | SAXException | ParserConfigurationException e) {
            e.printStackTrace();
        }
    }

    /** SAX event handler that builds the list of groups while the document is read. */
    private class MyHandler extends DefaultHandler {
        /** All completed groups, in document order. */
        private ArrayList<Group> groups;
        /** The group currently being read, or {@code null} when outside a group element. */
        private Group group;
        /** {@code true} while the parser is inside a staff element. */
        private boolean staff = false;
        /** Accumulates the text of the current staff element across characters() calls. */
        private final StringBuilder staffText = new StringBuilder();

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            super.characters(ch, start, length);
            // A SAX parser may deliver one text node in several chunks, so the text is
            // buffered here and only committed when the staff element ends.
            if (staff) {
                staffText.append(ch, start, length);
            }
        }

        @Override
        public void endDocument() throws SAXException {
            super.endDocument();
            print(groups);
        }

        @Override
        public void startDocument() throws SAXException {
            super.startDocument();
            groups = new ArrayList<>();
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes)
                throws SAXException {
            super.startElement(uri, localName, qName, attributes);
            if (GROUP_ELEMENT_TAG_NAME.equals(qName)) {
                group = new Group();
            } else if (LEADER_ELEMENT_TAG_NAME.equals(qName)) {
                if (group != null) {
                    if (group.leaders == null) {
                        group.leaders = new ArrayList<>();
                    }
                    String leaderName = attributes.getValue("name");
                    if (leaderName != null) {
                        group.leaders.add(leaderName);
                    }
                }
            } else if (STAFF_ELEMENT_TAG_NAME.equals(qName)) {
                if (group != null && group.staffs == null) {
                    group.staffs = new ArrayList<>();
                }
                // Start a fresh text buffer for this staff element.
                staffText.setLength(0);
                staff = true;
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            super.endElement(uri, localName, qName);
            if (GROUP_ELEMENT_TAG_NAME.equals(qName)) {
                if (group != null) {
                    groups.add(group);
                    // Leave the group so that elements outside any group
                    // cannot modify the one that was just stored.
                    group = null;
                }
            } else if (STAFF_ELEMENT_TAG_NAME.equals(qName)) {
                // Commit the complete text of the staff element as a single entry.
                if (staff && group != null && group.staffs != null && staffText.length() > 0) {
                    group.staffs.add(staffText.toString());
                }
                staffText.setLength(0);
                staff = false;
            }
        }
    }

    /** Plain holder for the leaders and staff members of one group; lists are created lazily. */
    private static class Group {
        ArrayList<String> leaders;
        ArrayList<String> staffs;
    }

    /**
     * Prints the company tag followed by each group (numbered from 1) and its
     * leaders and staff. Prints nothing when {@code groups} is null or empty.
     *
     * @param groups the groups collected by the handler
     */
    private void print(ArrayList<Group> groups) {
        if (groups == null || groups.isEmpty()) {
            return;
        }
        System.out.println(COMPANY_ELEMENT_TAG_NAME);
        for (int index = 0; index < groups.size(); index++) {
            System.out.println("\t" + GROUP_ELEMENT_TAG_NAME + " " + (index + 1));
            Group group = groups.get(index);
            if (group.leaders != null) {
                for (String leader : group.leaders) {
                    System.out.println("\t\t" + LEADER_ELEMENT_TAG_NAME + ":\t" + leader);
                }
            }
            if (group.staffs != null) {
                for (String staff : group.staffs) {
                    System.out.println("\t\t" + STAFF_ELEMENT_TAG_NAME + ":\t" + staff);
                }
            }
        }
    }
}
详细源码解析
SAXParserImpl对象的初始化
在使用SAX解析XML文件的过程中,首先,先通过其newInstance函数初始化一个SAXParserFactory对象,
/**
 * Creates the SAX parser factory.
 *
 * @return a newly constructed SAXParserFactoryImpl
 */
public static SAXParserFactory newInstance() {
// instantiate the class directly rather than using reflection
// Create a SAXParserFactoryImpl object
return new SAXParserFactoryImpl();
}
直接new一个SAXParserFactory的子类SAXParserFactoryImpl对象 然后,调用其newSAXParser函数
/**
 * Creates the SAX parser (excerpt; elided parts are marked with "......").
 *
 * @return a new SAXParserImpl constructed from this factory's features
 */
@Override
public SAXParser newSAXParser() throws ParserConfigurationException {
// ...... condition check whose branch is not entered in this walkthrough, omitted
try {
return new SAXParserImpl(features);
}
// ...... catch exception, code delete
}
也就是说,在这边直接初始化一个SAXParserImpl对象
/**
 * Stores the initial features (an empty map when none were given, otherwise
 * a HashMap copy of the argument) and then calls resetInternal().
 *
 * @param initialFeatures feature name to enabled flag
 */
SAXParserImpl(Map<String, Boolean> initialFeatures)
throws SAXNotRecognizedException, SAXNotSupportedException {
this.initialFeatures = initialFeatures.isEmpty()
? Collections.<String, Boolean>emptyMap()
: new HashMap<String, Boolean>(initialFeatures);
resetInternal();
}
/**
 * Creates a fresh ExpatReader and applies every stored initial feature to it
 * through setFeature.
 */
private void resetInternal()
throws SAXNotSupportedException, SAXNotRecognizedException {
reader = new ExpatReader();
for (Map.Entry<String,Boolean> entry : initialFeatures.entrySet()) {
reader.setFeature(entry.getKey(), entry.getValue());
}
}
解析XML文件
/**
 * Wraps the given uri in an InputSource and delegates to
 * parse(InputSource, DefaultHandler).
 *
 * @param uri location of the XML content
 * @param dh  handler passed on to the overloaded parse
 */
public void parse(String uri, DefaultHandler dh)
throws SAXException, IOException {
// ...... null-check code omitted
InputSource input = new InputSource(uri);
this.parse(input, dh);
}
初始化InputSource对象,然后将其作为参数,调用重载函数parse
/**
 * Obtains the XMLReader, registers the handler (when non-null) as content
 * handler, entity resolver, error handler and DTD handler, then parses the source.
 *
 * @param is source to parse
 * @param dh handler to register on the reader; may be null
 */
public void parse(InputSource is, DefaultHandler dh)
throws SAXException, IOException {
// ...... null-check code omitted
// Obtain the XMLReader object
XMLReader reader = this.getXMLReader();
if (dh != null) {
reader.setContentHandler(dh);
reader.setEntityResolver(dh);
reader.setErrorHandler(dh);
reader.setDTDHandler(dh);
}
reader.parse(is);
}
reader为刚刚在SAXParserImpl初始化过程中,初始化的一个ExpatReader对象,因此直接调用ExpatReader的parse函数
/**
 * Parses the input source (excerpt; elided parts are marked with "......").
 * The sources are tried in this order: character stream, byte stream, and
 * finally a stream opened from the system id.
 *
 * @param input the source to parse
 */
public void parse(InputSource input) throws IOException, SAXException {
// ...... null-check code omitted
// Try the character stream first.
Reader reader = input.getCharacterStream();
if (reader != null) {
try {
parse(reader, input.getPublicId(), input.getSystemId());
}
// ......
return;
}
// Try the byte stream.
InputStream in = input.getByteStream();
String encoding = input.getEncoding();
// null
if (in != null) {
try {
parse(in, encoding, input.getPublicId(), input.getSystemId());
}
// ......
return;
}
String systemId = input.getSystemId();
// ......
// Try the system id.
// Open a stream for the system id (the original note says this creates a
// URLConnection), then call the overloaded parse
in = ExpatParser.openUrl(systemId);
try {
parse(in, encoding, input.getPublicId(), systemId);
} finally {
// The stream was opened here, so it is closed here as well.
IoUtils.closeQuietly(in);
}
}
从上述代码看,这边创建了一个URLConnection,然后调用重载parse函数
/**
 * Creates an ExpatParser for this reader and lets it parse the whole stream
 * as a document.
 *
 * @param in          stream with the XML content
 * @param charsetName encoding passed to the ExpatParser
 * @param publicId    public id passed to the ExpatParser
 * @param systemId    system id passed to the ExpatParser
 */
private void parse(InputStream in, String charsetName, String publicId, String systemId)
throws IOException, SAXException {
// Create the ExpatParser object
ExpatParser parser = new ExpatParser(charsetName, this, processNamespaces, publicId, systemId);
parser.parseDocument(in);
}
初始化一个ExpatParser对象,然后调用其parseDocument函数 a. 初始化ExpatParser对象 这边使用了new直接初始化ExpatParser
/**
 * Constructor excerpt: falls back to DEFAULT_ENCODING when no encoding is
 * given and stores the value returned by the native initialize call in
 * the pointer field.
 */
/*package*/ ExpatParser(String encoding, ExpatReader xmlReader,
boolean processNamespaces, String publicId, String systemId) {
// ......
this.encoding = encoding == null ? DEFAULT_ENCODING : encoding;
this.pointer = initialize(
this.encoding,
processNamespaces
);
}
// Native initialize function; its return value is stored as the parser pointer
// by the constructor.
private native long initialize(String encoding, boolean namespacesEnabled);
这边调用了native方法initialize函数(org_apache_harmony_xml_ExpatParser.cpp)
// JNI excerpt: allocates a ParsingContext, creates an expat parser (namespace
// aware or not, depending on processNamespaces) and returns it converted
// through fromXMLParser. Elided parts are marked with "......".
static jlong ExpatParser_initialize(JNIEnv* env, jobject object, jstring javaEncoding,
jboolean processNamespaces) {
// Allocate parsing context.
std::unique_ptr<ParsingContext> context(new ParsingContext(object));
// ......
context->processNamespaces = processNamespaces;
// Create a parser.
XML_Parser parser;
ScopedUtfChars encoding(env, javaEncoding);
// ......
if (processNamespaces) {
// Use '|' to separate URIs from local names.
parser = XML_ParserCreateNS(encoding.c_str(), '|');
} else {
parser = XML_ParserCreate(encoding.c_str());
}
// ...... set default data
return fromXMLParser(parser);
}
设置一些默认的处理函数(external/expat/) 初始化native层的XML_Parser解析器时,使用XML_ParserCreateNS函数,这个函数位于external/expat下的xmlparse.c文件中
/* Creates a parser with namespace processing: stores the separator character
 * in a local array and forwards it, with a NULL memory suite, to
 * XML_ParserCreate_MM. */
XML_Parser XMLCALL
XML_ParserCreateNS(const XML_Char *encodingName, XML_Char nsSep)
{
XML_Char tmp[2];
/* Only tmp[0] is assigned here; tmp[1] is left untouched.
 * NOTE(review): presumably only the first character is consumed downstream - confirm. */
*tmp = nsSep;
return XML_ParserCreate_MM(encodingName, NULL, tmp);
}
即,调用XML_ParserCreate_MM函数
/* Thin wrapper: forwards all arguments to parserCreate and passes NULL as
 * the DTD argument. */
XML_Parser XMLCALL
XML_ParserCreate_MM(const XML_Char *encodingName,
const XML_Memory_Handling_Suite *memsuite,
const XML_Char *nameSep)
{
return parserCreate(encodingName, memsuite, nameSep, NULL);
}
// parserCreate is called next.
// Excerpt: allocates the XML_ParserStruct with malloc and, on success, records
// malloc/realloc/free in the parser's memory-handling suite. The handling of
// memsuite, nameSep and dtd is in the elided parts ("......").
static XML_Parser
parserCreate(const XML_Char *encodingName,
const XML_Memory_Handling_Suite *memsuite,
const XML_Char *nameSep,
DTD *dtd)
{
XML_Parser parser;
// ......
{
XML_Memory_Handling_Suite *mtemp;
// Allocate the memory for the XML parser
parser = (XML_Parser)malloc(sizeof(struct XML_ParserStruct));
if (parser != NULL) {
mtemp = (XML_Memory_Handling_Suite *)&(parser->m_mem);
mtemp->malloc_fcn = malloc;
mtemp->realloc_fcn = realloc;
mtemp->free_fcn = free;
}
}
// ...... initialize some default parameters
return parser;
}
这边主要初始化parser,并且初始化一些参数,最后会调用parserInit函数进行初始化
/* Excerpt: the body (initialization of the parser's default parameters) is
 * elided in this walkthrough. */
static void
parserInit(XML_Parser parser, const XML_Char *encodingName) {
// ...... initialize default parameters
}
如上,直接初始化XML_Parser对象,并为其初始化一些默认值 在初始化完成后,调用XML_SetNamespaceDeclHandler等函数设置其一些初始化值,此处不做分析,有兴趣可以自行分析 至此,XML_Parser解析器初始化完成
b. 此后,将处理解析文档 这边主要是调用了ExpatParser的parseDocument函数
/**
 * Parses a whole document from the stream by calling, in order,
 * startDocument(), parseFragment(in), finish() and endDocument().
 *
 * @param in stream with the XML content
 */
/*package*/ void parseDocument(InputStream in) throws IOException,
SAXException {
startDocument();
parseFragment(in);
finish();
endDocument();
}
即,调用了四个函数,最先和最后使用startDocument和endDocument,这个最后会调用到传入的DefaultHandler的startDocument和endDocument的函数 那么接下来看解析XML文件的主要内容函数parseFragment
/**
 * Reads the stream in chunks of at most BUFFER_SIZE bytes and hands every
 * chunk to the native appendBytes until the end of the stream is reached.
 *
 * @param in stream with the XML content
 */
private void parseFragment(InputStream in)
throws IOException, SAXException {
byte[] buffer = new byte[BUFFER_SIZE];
int length;
// read() returns -1 at end of stream
while ((length = in.read(buffer)) != -1) {
try {
appendBytes(this.pointer, buffer, 0, length);
}
// ...... catch exception code delete
}
}
// Native method that receives a chunk of XML bytes (array, offset, length)
// together with the native parser pointer.
private native void appendBytes(long pointer, byte[] xml, int offset,
int length) throws SAXException, ExpatException;
从这边可以看到,这里是顺序读取xml文档的内容到内存(每次最多读取BUFFER_SIZE个字节),然后通过native的appendBytes函数对读取到的数据进行解析,其对应的native实现如下
// JNI excerpt: exposes the Java byte array as a const char* and forwards it
// to append() with isFinal = XML_FALSE.
static void ExpatParser_appendBytes(JNIEnv* env, jobject object, jlong pointer,
jbyteArray xml, jint byteOffset, jint byteCount) {
ScopedByteArrayRO byteArray(env, xml);
// ......
const char* bytes = reinterpret_cast<const char*>(byteArray.get());
append(env, object, pointer, bytes, byteOffset, byteCount, XML_FALSE);
}
// Excerpt: feeds byteCount bytes starting at bytes + byteOffset to XML_Parse.
// The JNI env and Java object are stored in the parsing context for the
// duration of the call and cleared afterwards.
static void append(JNIEnv* env, jobject object, jlong pointer,
const char* bytes, size_t byteOffset, size_t byteCount, jboolean isFinal) {
XML_Parser parser = toXMLParser(pointer);
ParsingContext* context = toParsingContext(parser);
context->env = env;
context->object = object;
// Entered only when XML_Parse failed and no Java exception is already pending.
if (!XML_Parse(parser, bytes + byteOffset, byteCount, isFinal) && !env->ExceptionCheck()) {
// ......
}
context->object = NULL;
context->env = NULL;
}
通过XML_Parse函数解析
/* Excerpt: validates the arguments and the current parsing status, then
 * copies len bytes from s into a buffer obtained from XML_GetBuffer and
 * delegates to XML_ParseBuffer. Returns XML_STATUS_ERROR on invalid
 * arguments, on a suspended or finished parser, when startParsing fails,
 * or when no buffer could be obtained. */
enum XML_Status XMLCALL
XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
if ((parser == NULL) || (len < 0) || ((s == NULL) && (len != 0))) {
if (parser != NULL)
parser->m_errorCode = XML_ERROR_INVALID_ARGUMENT;
return XML_STATUS_ERROR;
}
switch (parser->m_parsingStatus.parsing) {
case XML_SUSPENDED:
parser->m_errorCode = XML_ERROR_SUSPENDED;
return XML_STATUS_ERROR;
case XML_FINISHED:
parser->m_errorCode = XML_ERROR_FINISHED;
return XML_STATUS_ERROR;
// The status is initialized to this value
case XML_INITIALIZED:
if (parser->m_parentParser == NULL && !startParsing(parser)) {
parser->m_errorCode = XML_ERROR_NO_MEMORY;
return XML_STATUS_ERROR;
}
/* fall through */
default:
// Start parsing
parser->m_parsingStatus.parsing = XML_PARSING;
}
// ......
{
void *buff = XML_GetBuffer(parser, len);
if (buff == NULL)
return XML_STATUS_ERROR;
else {
memcpy(buff, s, len);
// Parse the buffer
return XML_ParseBuffer(parser, len, isFinal);
}
}
}
最后调用XML_ParseBuffer函数进行数据解析
/* Parses len bytes that were already placed in the parser's buffer.
 * After the status checks it extends the buffer end by len, runs the
 * current m_processor over the buffered range and translates the outcome
 * into an XML_Status. On a processor error the processor is switched to
 * errorProcessor and XML_STATUS_ERROR is returned. */
enum XML_Status XMLCALL
XML_ParseBuffer(XML_Parser parser, int len, int isFinal) {
const char *start;
enum XML_Status result = XML_STATUS_OK;
if (parser == NULL)
return XML_STATUS_ERROR;
switch (parser->m_parsingStatus.parsing) {
case XML_SUSPENDED:
parser->m_errorCode = XML_ERROR_SUSPENDED;
return XML_STATUS_ERROR;
case XML_FINISHED:
parser->m_errorCode = XML_ERROR_FINISHED;
return XML_STATUS_ERROR;
case XML_INITIALIZED:
if (parser->m_parentParser == NULL && !startParsing(parser)) {
parser->m_errorCode = XML_ERROR_NO_MEMORY;
return XML_STATUS_ERROR;
}
/* fall through */
default:
parser->m_parsingStatus.parsing = XML_PARSING;
}
// Set up the positions for this parse run
start = parser->m_bufferPtr;
parser->m_positionPtr = start;
parser->m_bufferEnd += len;
parser->m_parseEndPtr = parser->m_bufferEnd;
parser->m_parseEndByteIndex += len;
parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;
// Call m_processor to parse the XML data; according to the walkthrough it is
// set to prologInitProcessor when the XML_Parser object is initialized
parser->m_errorCode = parser->m_processor(parser, start, parser->m_parseEndPtr, &parser->m_bufferPtr);
if (parser->m_errorCode != XML_ERROR_NONE) {
parser->m_eventEndPtr = parser->m_eventPtr;
parser->m_processor = errorProcessor;
return XML_STATUS_ERROR;
} else {
switch (parser->m_parsingStatus.parsing) {
case XML_SUSPENDED:
result = XML_STATUS_SUSPENDED;
break;
case XML_INITIALIZED:
case XML_PARSING:
// The final buffer marks the parser as finished; otherwise control
// falls through to the default label and continues below.
if (isFinal) {
parser->m_parsingStatus.parsing = XML_FINISHED;
return result;
}
default: ; /* should not happen */
}
}
XmlUpdatePosition(parser->m_encoding, parser->m_positionPtr, parser->m_bufferPtr, &parser->m_position);
parser->m_positionPtr = parser->m_bufferPtr;
return result;
}
调用prologInitProcessor函数,解析数据 此后,通过while循环查找对应的数据,读取到内存,重复此阶段,完成所有数据解析
SAX解析XML总结
使用方法
1) 通过SAXParserFactory的newInstance函数创建一个SAXParserFactory对象,再通过其newSAXParser函数,初始化一个SAXParserImpl对象,然后调用其parse函数,将xml文件名和初始化的继承自DefaultHandler类的对象一起作为其参数
2) 在继承自DefaultHandler类的对象中,重写startDocument/endDocument/startElement/endElement/characters函数,然后一步步解析该xml文件即可
源码分析
1) 通过SAXParserFactory的newInstance函数创建一个SAXParserFactoryImpl对象,然后通过其newSAXParser函数创建一个SAXParserImpl对象
2) 在创建SAXParserImpl对象的时候,初始化一个ExpatReader对象
3) 调用SAXParserImpl对象的parse函数时候,将xml文件名和初始化的继承自DefaultHandler类的对象作为参数
4) 在parse函数调用的时候,直接调用ExpatReader对象的parse函数,在该parse函数调用的过程中,初始化一个ExpatParser对象,并且调用其parseDocument函数
5) 在初始化ExpatParser对象的时候,调用native方法initialize,在该函数中通过libexpat库初始化native层的XML_Parser解析器
6) 在调用其parseDocument的时候,按缓冲区大小(BUFFER_SIZE)分块读取xml文件,每读取一块数据就交给native层进行解析,如此重复直到整个文件解析完成
SAX解析XML的优缺点
- 由于SAX解析过程中,在native层进行解析,因此解析速度比较快
- 由于SAX解析xml的过程中,是读取部分数据进行解析,因此使用内存相对较少,而且较固定
- 由于SAX解析xml的过程中,需要重写DefaultHandler的部分函数,而且需要一步步进行解析,因此需要对xml文件的内容和结构有相应的了解
扩展
由于SAXParserFactory提供了两个重载的newInstance函数,因此可以通过带参数的newInstance重载函数指定工厂实现类,从而客制化自己的解析器。
网友评论