从百度文库找一份单词表,内容格式如下:
原始单词.png
经过 PHP 代码处理,提取单词并保存到新的文件:
// Extract the leading English word from every line of word.txt and append
// it, one word per line, to new_word.txt. Each extracted word is also echoed
// followed by "<br>".
$file = fopen("word.txt", "r");
if ($file === false) {
    die("Cannot open word.txt for reading");
}
$new_file = fopen('new_word.txt', 'a');
if ($new_file === false) {
    fclose($file);
    die("Cannot open new_word.txt for appending");
}
$words = array();
$i = 0; // index of the current input line; $words is keyed by it
// Loop on the return value of fgets() instead of feof(): feof() only turns
// true after a read has already failed, so the old loop ran one extra pass
// with $str === false.
while (($str = fgets($file)) !== false) {
    // Only lines that begin with ASCII letters followed by whitespace carry
    // a word. Lines that do not match are skipped outright, so $words[$i] is
    // never read while undefined.
    if (preg_match('/^([a-zA-Z]+)\s+/', $str, $matches) === 1) {
        echo $matches[1] . "<br>";
        $words[$i] = $matches[1] . "\n";
        fputs($new_file, $words[$i]);
    }
    $i++;
}
fclose($file);
fclose($new_file);
文件内容如下:
新的文件.png
获取音频:
// Download the pronunciation audio of every word listed in new_word.txt
// (one word per line) from dict.youdao.com and store it as
// ./records/<word>.mp3.
$file = fopen("new_word.txt", "r");
if ($file === false) {
    die("Cannot open new_word.txt for reading");
}
// file_put_contents() does not create missing directories.
if (!is_dir("./records") && !mkdir("./records", 0777, true)) {
    fclose($file);
    die("Cannot create ./records");
}
// Loop on fgets() itself; a feof()-driven loop performs one extra iteration
// with $str === false, which used to produce a request for an empty word.
while (($str = fgets($file)) !== false) {
    echo $str;
    // Strip only the line terminator. Cutting the last byte unconditionally
    // removed a real letter from a final line without a trailing newline and
    // left "\r" behind on CRLF files.
    $str = rtrim($str, "\r\n");
    if ($str === '') {
        continue; // blank line: nothing to download
    }
    echo $str;
    $output = file_get_contents("http://dict.youdao.com/dictvoice?audio=" . urlencode($str) . "&type=2");
    if ($output === false) {
        // Do not leave an empty .mp3 behind when the download fails.
        echo "download failed: " . $str . "\n";
        continue;
    }
    file_put_contents("./records/$str" . ".mp3", $output);
}
fclose($file);
image.png
抓取界面
/**
 * Download the page for one word, cache it as htmls/<word>.html next to this
 * file, and verify that the page really describes the requested word.
 *
 * @param string $word_url URL of the page to download.
 * @param string $word     The word the page is expected to describe; also
 *                         used as the cache file name.
 * @return mixed The result of $this->error('出错') when the download fails or
 *               the page's headword does not match $word (case-insensitive);
 *               nothing on success.
 */
function get_word_msg($word_url, $word)
{
    $cache_path = __DIR__ . '/htmls/' . $word . ".html";

    $page = file_get_contents($word_url);
    if ($page === false) {
        // Do not cache a failed download as an empty HTML file.
        return $this->error('出错');
    }
    file_put_contents($cache_path, $page);

    $html = new simple_html_dom();
    $html->load_file($cache_path);

    // find(..., 0) yields null when the element is absent, so guard before
    // dereferencing ->plaintext.
    $node = $html->find('#cigencizui-word', 0);
    $web_word = $node ? $node->plaintext : null;
    if ($web_word === null || strcasecmp($web_word, $word) != 0) {
        var_dump($web_word);
        var_dump($word);
        // Release the DOM on the error path as well.
        $html->clear();
        return $this->error('出错');
    }
    $html->clear();
    // Pause between successful downloads.
    sleep(1);
}
/**
 * Look a word up on www.dicts.cn, extract the relative address of its detail
 * page from the raw response (headers included), and hand the absolute URL to
 * get_word_msg().
 *
 * @param string $word The word to look up.
 * @return mixed The result of $this->error('出错') when the request fails or
 *               the response contains no 'dictword' marker; nothing otherwise.
 */
function grab_word($word)
{
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, "http://www.dicts.cn/dict/dict/dict!searchhtml3.asp?id=" . urlencode($word));
    curl_setopt($curl, CURLOPT_HEADER, 1);          // include response headers in $data
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);  // return the response instead of printing it
    $data = curl_exec($curl);
    curl_close($curl);
    if ($data === false) {
        return $this->error('出错');
    }

    // Keep everything from the first occurrence of 'dictword' onwards.
    $data = strstr($data, 'dictword');
    if ($data === false) {
        // Without the marker the old code silently fetched the site root.
        return $this->error('出错');
    }
    // A URL cannot span lines, so stop at the first line break.
    // NOTE(review): assumes the relative address ends at the end of its line;
    // confirm against a real response.
    $data = substr($data, 0, strcspn($data, "\r\n"));

    $real_url = "http://www.dicts.cn/" . $data;
    $this->get_word_msg($real_url, $word);
}
// 通过循环可以抓取所有单词
利用simple_html_dom分析网页抓取内容
/**
 * Parse every cached word page and store the results in h_dict_word.
 *
 * Reads new_word.txt (one word per line) next to this file; for each word
 * that has a cached htmls/<word>.html, calls analysis_word(). Rows still
 * pending in self::$words after the loop are inserted at the end.
 */
public function grab_word_act(){
    foreach (array("/htmls", "/images") as $dir) {
        if (!is_dir(__DIR__ . $dir)) {
            mkdir(__DIR__ . $dir);
        }
    }

    $file = fopen(__DIR__ . "/new_word.txt", "r");
    if ($file === false) {
        trigger_error("Cannot open " . __DIR__ . "/new_word.txt", E_USER_WARNING);
        return;
    }

    // Loop on fgets() itself: a feof()-driven loop runs once more after the
    // last line with $str === false.
    while (($str = fgets($file)) !== false) {
        // Strip only the line terminator; cutting the last byte blindly
        // truncated a final line that has no trailing newline.
        $str = rtrim($str, "\r\n");
        if ($str === '') {
            continue;
        }
        if (file_exists(__DIR__ . '/htmls/' . $str . ".html")) {
            $this->analysis_word($str);
        }
    }
    // Close before touching the database so the handle is released even if
    // the insert fails.
    fclose($file);

    // Flush the remaining rows; skip the call when nothing is pending, and
    // reset the buffer so a later run does not insert the same rows again.
    if (!empty(self::$words)) {
        Db::table('h_dict_word')->insertAll(self::$words);
        self::$words = array();
    }
}
/**
 * Parse the cached page htmls/<word>.html and queue one row for h_dict_word.
 *
 * The row is appended to self::$words; once 10 rows are queued they are
 * inserted in one batch and the queue is emptied. Pages whose headword is
 * missing or does not match $word (case-insensitive) are skipped.
 *
 * @param string $word The word whose cached page should be parsed.
 * @return void
 */
public function analysis_word($word)
{
    $html = new simple_html_dom();
    $html->load_file(__DIR__ . '/htmls/' . $word . ".html");

    // Validate the page before reading anything else from it. find(..., 0)
    // yields null for a missing element, so every node is checked before its
    // properties are read.
    $word_node = $html->find('#cigencizui-word', 0);
    if (!$word_node || strcasecmp($word_node->plaintext, $word) != 0) {
        // Release the DOM on the early-exit path too.
        $html->clear();
        return;
    }

    $data = $this->getEmptyArray(array('source', 'story', 'dictionary', 'symbol', 'mean', 'name','remember'));
    $data['name'] = $word;

    // Phonetic symbol and meaning are optional: when the element is absent
    // the value supplied by getEmptyArray() is kept.
    $yinbiao = $html->find('#cigencizui-word-pron>.en-UK', 0);
    if ($yinbiao) {
        $data['symbol'] = $yinbiao->innertext;
    }
    $word_mean = $html->find('#cigencizui-word-info ul', 0);
    if ($word_mean) {
        $data['mean'] = $word_mean->innertext;
    }

    $divs = $html->find('#cigencizui-content .page-header~div');
    if (!empty($divs)) {
        // $flag remembers which section the most recent heading div opened;
        // the divs that follow are appended to that section.
        $flag = "";
        foreach ($divs as $item) {
            if (strpos($item->plaintext, '词源说明') === 0) {
                $flag = "source";
            } else if (strpos($item->plaintext, '21世纪大') === 0) {
                $flag = 'dictionary';
                $data['dictionary'] = array();
            } else if (strpos($item->plaintext, "不拘一格背单词") === 0) {
                $flag = "remember";
            } else if (strpos($item->plaintext, "词源故事") === 0) {
                $flag = 'story';
            } else if ($flag == 'source') {
                $data['source'] .= $item->innertext;
            } else if ($flag == 'remember') {
                $data['remember'] .= $item->innertext;
            } else if ($flag == 'story') {
                $data['story'] .= $item->innertext;
            }
        }
        if (array_key_exists('dictionary', $data)) {
            $spans = $html->find('#cigencizui-content .word');
            // NOTE(review): each iteration overwrites the previous value, so
            // only the last ".word" element is stored; if none is found the
            // value may remain the array() assigned above. Behaviour kept
            // as-is; confirm whether concatenation was intended.
            foreach ($spans as $item) {
                $data['dictionary'] = $item->innertext;
            }
        }
    }

    self::$words[] = $data;
    // Insert in batches of 10 rows.
    if (count(self::$words) == 10) {
        Db::table('h_dict_word')->insertAll(self::$words);
        self::$words = array();
    }
    $html->clear();
}
image.png
网友评论