简单树匹配的算法介绍见博客。
首先将网页表示成DOM结构,然后利用简单树匹配算法计算两个网页之间的最大匹配结点数,从而得到网页结构相似度。
其中,, 分别表示两棵网页DOM树中结点数目,SimpleTreeMatching 表示这两棵树的最大匹配结点数.
计算网页结构相似度
代码:
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
import urllib.request
from bs4 import BeautifulSoup
def getNodeNum(root):
if root is None:
return 0
elif not hasattr(root, "children"):
return 0
else:
res = 1
# childrens = root.children
for ch in root.children:
res += getNodeNum(ch)
return res
def simpleTreeMath(root_a, root_b):
"""
利用动态规划,实现简单树匹配算法
:param root_a:
:param root_b:
:return:
"""
if root_a is None or root_b is None:
return 0
if root_a.name != root_b.name:
return 0
if not hasattr(root_a, "children") or not hasattr(root_b, "children"):
return 0
#
childrens_a = [x for x in root_a.children]
childrens_b = [x for x in root_b.children]
m = len(childrens_a)
n = len(childrens_b)
res_M = [[0 for j in range(n + 1)] for i in range(m + 1)]
for i in range(1, m + 1):
for j in range(1, n + 1):
res_M[i][j] = max(res_M[i - 1][j], res_M[i][j - 1],
res_M[i - 1][j - 1] + simpleTreeMath(childrens_a[i - 1], childrens_b[j - 1]))
return res_M[m][n] + 1
if __name__ == "__main__":
url1 = "https://www.zhihu.com/"
f = urllib.request.urlopen(url1, timeout=5).read()
# 获取网页的html内容
f = """
<html><head><title>doc1</title></head>
<body>
<ol>
<li>Academic Service</li>
<li>Admission and Enrolment</li>
<li>Awards and Scholarships</li>
</ol>"""
soup1 = BeautifulSoup(f, 'html.parser')
root_a = soup1.html
# url2 = "https://www.csdn.net/"
# f = urllib.request.urlopen(url2, timeout=5).read()
f = """<<html><head><title>doc2</title></head>
<body>
<ul>
<li>Strudent Union</li>
<ol>
<li>Representation</li>
<li>Arts & Leisure</li>
<li>Support</li>
</ol>
<li>Graduate Student</li>
<ol>
<li>Graduate Attributes</li>
<li>Graduate Centre</li>
<li>Graduate Union</li>
</ol>
</ul>
"""
soup2 = BeautifulSoup(f, 'html.parser')
root_b = soup2.html
res = simpleTreeMath(root_a, root_b)
num_roota = getNodeNum(root_a)
num_rootb = getNodeNum(root_b)
sim_val = 2 * res / (num_roota + num_rootb)
print(res)
print(num_roota)
print(num_rootb)
print(sim_val)
结果:
4
8
15
0.34782608695652173
网友评论