/**
 * Extract the distinct visible text fragments from a piece of HTML.
 *
 * HTML comments, <style> blocks and <script> blocks are removed, every
 * remaining tag is treated as a boundary between fragments, and each text
 * node is further split on line breaks. Fragments that are empty, purely
 * numeric, contain no CJK/Latin letter or digit, or look like a single
 * icon-font entity (e.g. "&#xe601;") are discarded. Duplicates are removed.
 *
 * @param string $html Raw HTML markup.
 *
 * @return array[] List of ['text' => string, 'len' => int] entries, where
 *                 'len' is the character length (mb_strlen) of 'text',
 *                 ordered by 'len' descending.
 */
public static function getResourceByTemp($html)
{
    $html = preg_replace("/<!--.*?-->/is", '', $html); // drop HTML comments
    $html = preg_replace("/<style.*?>.*?<\/style>/is", '', $html); // drop <style> blocks
    $html = preg_replace("/<script.*?>.*?<\/script>/is", '', $html); // drop <script> blocks

    // Replace every remaining tag with a marker that is unlikely to occur in
    // real content, so the text between tags can be recovered with explode().
    $separator = '::#::myself::#::';
    $html = preg_replace("/<.*?>/is", $separator, $html);

    $fragments = [];
    foreach (explode($separator, (string) $html) as $chunk) {
        // A text node may span several lines; treat each line as its own
        // fragment. Split on any newline style rather than PHP_EOL so the
        // result does not depend on the platform the code runs on.
        foreach (preg_split('/\r\n|\r|\n/', $chunk) as $line) {
            $line = trim($line);
            if ($line === '') {
                continue;
            }

            // Skip purely numeric fragments.
            if (is_numeric($line)) {
                continue;
            }

            // Skip fragments made only of symbols: if removing every CJK
            // ideograph, Latin letter and digit changes nothing, there was
            // no such character to begin with.
            $symbolsOnly = preg_replace("/[\x{4e00}-\x{9fa5}A-Za-z0-9]/u", '', $line);
            if ($symbolsOnly === $line) {
                continue;
            }

            // Skip icon-font entities such as "&#xe601;" (7 or 8 bytes long).
            $byteLength = strlen($line);
            if (strpos($line, '&#x') !== false && ($byteLength == 7 || $byteLength == 8)) {
                continue;
            }

            $fragments[] = $line;
        }
    }

    $result = [];
    foreach (array_unique($fragments) as $text) {
        $result[] = ['text' => $text, 'len' => mb_strlen($text)];
    }

    // Longest fragments first.
    array_multisort(array_column($result, 'len'), SORT_DESC, $result);

    return $result;
}
网友评论