1) import xml.dom.minidom
import xml.dom.minidom

# Parse an XML document from a file; the context manager closes the handle
# once the DOM tree has been built.
with open("E:/Y.xml", encoding="utf-8") as f:
    domtree = xml.dom.minidom.parse(f)

# Alternative: parse XML held in a string. The argument must be a complete
# document -- an empty string raises xml.parsers.expat.ExpatError -- and the
# result is kept separate so it does not overwrite the tree parsed above.
string_domtree = xml.dom.minidom.parseString("<terms><term>demo</term></terms>")

rootnode = domtree.documentElement  # root element
rootname = rootnode.nodeName  # root element's name (DOM attributes are lowerCamelCase)
terms = domtree.getElementsByTagName("term")  # every element named "term"
# terms.length is the number of matches

for term in terms:
    for t in term.childNodes:
        print(t.nodeValue)  # value of each child node of the term

# To print only every other term (use `flag % 2 == 0` to `continue` instead
# if you want the odd-indexed ones):
for flag, term in enumerate(terms):
    if flag % 2 != 0:
        continue
    for i in term.childNodes:
        print(i.nodeValue)
2) import xml.etree.ElementTree as ET
<?xml version="1.0" encoding="utf-8"?>
<tmx version="1.4">
<header creationtool="SDL Language Platform" creationtoolversion="8.1" o-tmf="SDL TM8 Format" datatype="xml" segtype="sentence" adminlang="en-US" srclang="en-US" creationdate="20181122T094526Z" creationid="SKY-20170626EEO\transn">
<prop type="x-Recognizers">RecognizeAll</prop>
<prop type="x-IncludesContextContent">True</prop>
<prop type="x-TMName">xml</prop>
<prop type="x-TokenizerFlags">DefaultFlags</prop>
<prop type="x-WordCountFlags">DefaultFlags</prop>
</header>
<body>
<tu creationdate="20181128T013846Z" creationid="SKY-20170626EEO\transn" changedate="20181128T013853Z" changeid="SKY-20170626EEO\transn" lastusagedate="20181128T044856Z" usagecount="9">
<prop type="x-LastUsedBy">SKY-20170626EEO\transn</prop>
<prop type="x-Context">0, 0</prop>
<prop type="x-Context">7085221958494716325, 9123451527759037284</prop>
<prop type="x-ContextContent">Please check your SD card. | | 请检查你的SD卡 | </prop>
<prop type="x-Origin">TM</prop>
<prop type="x-ConfirmationLevel">Translated</prop>
<tuv xml:lang="en-US">
<seg>UC Browser</seg>
</tuv>
<tuv xml:lang="zh-CN">
<seg>UC浏览器</seg>
</tuv>
</tu>
<tu creationdate="20181128T013900Z" creationid="SKY-20170626EEO\transn" changedate="20181128T013901Z" changeid="SKY-20170626EEO\transn" lastusagedate="20181128T032310Z" usagecount="4">
<prop type="x-LastUsedBy">SKY-20170626EEO\transn</prop>
<prop type="x-Context">0, 0</prop>
<prop type="x-Context">1803647357637868, 108186968084773</prop>
<prop type="x-ContextContent">UC Browser | | UC浏览器 | </prop>
<prop type="x-Origin">TM</prop>
<prop type="x-ConfirmationLevel">Translated</prop>
<tuv xml:lang="en-US">
<seg>Search</seg>
</tuv>
<tuv xml:lang="zh-CN">
<seg>搜索</seg>
</tuv>
</tu>
</body>
</tmx>
<code>
import xml.etree.ElementTree as ET

# ElementTree stores the xml:lang attribute under the reserved XML namespace,
# so it must be looked up with this qualified name (not "lang"/"language").
XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"

tree = ET.parse("D:/xml.xml")
root = tree.getroot()  # root node of the element tree
root.tag  # tag name
root.attrib  # attribute dictionary
root.text  # text content
root.find("body")  # first child element with the given tag name
root.findall("body")  # returns a list
root.iterfind("string")  # returns an iterable
# With a bare tag name the lookups above only see the direct children of
# root; they do not reach the children of body.
# To find elements below body, use iter() or a path expression:
root.iter()  # iterator over root and every element beneath it
list(root.iter())  # the same elements as a list, one entry per element
len(list(root.iter()))  # number of elements
root.iter("tuv")  # iterator over every element whose tag is tuv

# Print every seg, also reading the language attribute of its tuv
for tuv in root.iter("tuv"):
    attrib = tuv.attrib.get(XML_LANG)
    seg = tuv.find("seg")
    print(seg.text)

# Same idea: print every seg
for tuv in root.iter("tuv"):
    seg = tuv.find("seg")
    print(seg.text)

# Third variant
for tu in root.iter("tu"):
    tuv = tu.find("tuv")  # a tu has several tuv; find (not findall) returns the first
    seg = tuv.find("seg")  # find returns the first seg of that tuv

# There is a single root; to visit its direct children just write `for e in root`
root.findall("./body/tu/tuv/seg")
root.findall(f"body/*/*[@{XML_LANG}='en-US']")
for s in root.findall(".//tuv"):
    print(s.items())  # items() returns the attributes as a list of (name, value)

# Supported XPath syntax:
# tag------Selects all child elements with the given tag. For example, spam selects all child elements named spam, and spam/egg selects all grandchildren named egg in all children named spam.
# *---------Selects all child elements. For example, */egg selects all grandchildren named egg.
# .---------Selects the current node. This is mostly useful at the beginning of the path, to indicate that it's a relative path.
# //--------Selects all subelements, on all levels beneath the current element. For example, .//egg selects all egg elements in the entire tree.
# .. --------Selects the parent element.
# [@attrib]-------Selects all elements that have the given attribute.
# [@attrib='value']-----------Selects all elements for which the given attribute has the given value. The value cannot contain quotes.
# [tag] -----------Selects all elements that have a child named tag. Only immediate children are supported.
# [tag='text']------Selects all elements that have a child named tag whose complete text content, including descendants, equals the given text.
# [position]--------Selects all elements that are located at the given position. The position can be either an integer (1 is the first position), the expression last() (for the last position), or a position relative to the last position (e.g. last()-1).

root.findall(f".//*[@{XML_LANG}='zh-CN']/seg")
for i in root.iter("tuv"):
    # .get() avoids a KeyError on elements that carry no xml:lang attribute
    if i.attrib.get(XML_LANG) == "zh-CN":
        seg = i.find("seg")
        print(seg.text)
<time>
20181129 17:20
</time>
</code>
3) from lxml import etree
from lxml import etree

# Option A: parse markup held in memory, e.g. the body of an HTTP response;
# fromstring() returns the root element directly.
# doc = etree.fromstring(result.content)
# Option B: parse a file by name. parse() returns an ElementTree wrapper, so
# take its root element before reading tag/attrib.
doc = etree.parse("E:/Y.xml").getroot()
doc.tag  # tag name
doc.attrib.get("term")  # attribute value (None when the attribute is absent)
doc.findall("slide")  # all child elements named slide
doc.findall(".//term")  # all descendant elements named term
for item in doc.findall(".//item"):
    print(item.text)

# Parsing HTML: pass an HTMLParser, because the default XML parser rejects
# markup that is not well-formed XML.
html = etree.parse("E:/Y.html", etree.HTMLParser())  # file name
result = etree.tostring(html, encoding="utf-8", pretty_print=True, method="html")
result.decode("utf-8")
html.xpath('//li[@class="item-0"]//text()')
result = html.xpath('//li[@class="item-1"]/a/text()')  # text directly inside the a node
result1 = html.xpath('//li[@class="item-1"]//text()')  # text of every descendant of the li
网友评论