Scraping CNKI Literature with a Python 3.6 Crawler

Author: python与数据分析 | Published 2018-11-24 20:54

    I recently needed to collect data from CNKI (知网) for work: title, source, keywords, author, affiliation, classification number, abstract, and similar articles. CNKI's anti-scraping is aggressive and the article-page links are encrypted; I tried pyspider, scrapy, and selenium, but none of them could reach an article page: every attempt redirected straight to the CNKI homepage. So I fell back on one of CNKI's own interfaces instead: http://yuanjian.cnki.com.cn/. The figures below compare the number of journal results for "卷积神经网络" (convolutional neural network) on the two sites:

    [Figures: journal result counts for "卷积神经网络" on the two sites]
    A closer look shows that the site uses POST requests, and the key is the form parameters. Open Yuanjian, search for your topic, press F12, and inspect the form data under the request parameters. In my case the topic is convolutional neural networks and the article type is journals; just substitute your own parameters here (a quick test request follows the form data below).
    formdata = {'Type': 1,
               'Order': 1,
               'Islegal': 'false',
               'ArticleType': 1,
               'Theme': '卷积神经网络',
               'searchType': 'MulityTermsSearch',
               'ParamIsNullOrEmpty': 'true',
               'Page': i}
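
    Before writing the full crawler, you can sanity-check these parameters with a single request. Here is a minimal sketch (it reuses the User-Agent from the full script below; whether it succeeds depends on the site's anti-scraping checks at the time):

    import requests

    url = 'http://yuanjian.cnki.net/Search/Result'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/59.0.3071.115 Safari/537.36'}
    formdata = {'Type': 1,
                'Order': 1,
                'Islegal': 'false',
                'ArticleType': 1,
                'Theme': '卷积神经网络',
                'searchType': 'MulityTermsSearch',
                'ParamIsNullOrEmpty': 'true',
                'Page': 1}
    r = requests.post(url, data=formdata, headers=headers)
    r.raise_for_status()
    # A successful request returns the first page of search results as HTML
    print(r.status_code, len(r.text))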
    

    The full implementation follows:

    # -*- coding: utf-8 -*-
    import json
    import codecs

    import requests
    from lxml import etree
    
    
    class CNKI(object):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
        # Cookies copied from a logged-in browser session; they expire,
        # so replace them with your own before running
        cookies = {
            'Cookie': 'Ecp_ClientId=4181108101501154830; cnkiUserKey=ec1ef785-3872-fac6-cad3-402229207945; UM_distinctid=166f12b44b1654-05e4c1a8d86edc-b79183d-1fa400-166f12b44b2ac8; KEYWORD=%E5%8D%B7%E7%A7%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C%24%E5%8D%B7%E7%A7%AF%20%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C; Ecp_IpLoginFail=1811121.119.135.10; amid=73b0014b-8b61-4e24-a333-8774cb4dd8bd; SID=110105; CNZZDATA1257838113=579682214-1541655561-http%253A%252F%252Fsearch.cnki.net%252F%7C1542070177'}
        # Browser-like request headers for the search POST
        param = {
            'Accept': 'text/html, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'yuanjian.cnki.net',
            'Origin': 'http://yuanjian.cnki.net',
            'Referer': 'http://yuanjian.cnki.net/Search/Result',
            'X-Requested-With': 'XMLHttpRequest'}
    
        def content(self):
            li = []
            # Loop over all result pages (133 pages for this query)
            for page in range(1, 134):
                url = 'http://yuanjian.cnki.net/Search/Result'
                print('Current page:', page)
                # POST form data for this page
                formdata = {'Type': 1,
                            'ArticleType': 1,
                            'Theme': '卷积神经网络',
                            'Page': page}
                print(formdata)
                try:
                    r = requests.post(url, data=formdata, cookies=self.cookies,
                                      headers={**self.headers, **self.param})
                    r.raise_for_status()
                    r.encoding = r.apparent_encoding
                    data = etree.HTML(r.text)
                    # Links to the article detail pages
                    url_list = data.xpath("//*[@id='article_result']/div/div/p[1]/a[1]/@href")
                    # Keywords, joined into one string per result
                    key_wordlist = []
                    all_items = data.xpath("//*[@id='article_result']/div/div")
                    for n in range(1, len(all_items) + 1):
                        key_word = data.xpath("//*[@id='article_result']/div/div[%s]/div[1]/p[1]/a/text()" % n)
                        key_wordlist.append(';'.join(key_word))
                    # Sources (journal names), one per result
                    sources = data.xpath("//*[@id='article_result']/div/div/p[3]/a[1]/span/text()")
                    for index, link in enumerate(url_list):
                        items = {}
                        try:
                            print('Current link:', link)
                            content = requests.get(link, headers=self.headers)
                            contents = etree.HTML(content.text)
                            # Article title
                            title = contents.xpath("//h1[@class='xx_title']/text()")[0]
                            print('Title:', title)
                            # Source
                            source = sources[index]
                            items['source'] = source
                            print('Source:', source)
                            items['title'] = title
                            # Keywords
                            each_key_words = key_wordlist[index]
                            print('Keywords:', each_key_words)
                            items['keywordsEn'] = ''
                            items['keywordsCh'] = each_key_words
                            # Authors
                            author = contents.xpath("//*[@id='content']/div[2]/div[3]/a/text()")
                            items['author'] = author
                            print('Authors:', author)
                            # Affiliation
                            unit = contents.xpath("//*[@id='content']/div[2]/div[5]/a[1]/text()")
                            units = ''.join(unit).strip(';')
                            items['unit'] = units
                            print('Affiliation:', units)
                            # Classification number
                            classify = contents.xpath("//*[@id='content']/div[2]/div[5]/text()")[-1]
                            items['classify'] = classify
                            print('Classification number:', classify)
                            # Abstract
                            abstract = contents.xpath("//div[@class='xx_font'][1]/text()")[1].strip()
                            print('Abstract:', abstract)
                            items['abstractCh'] = abstract
                            items['abstractEn'] = ''
                            # Similar articles: join the text nodes, split on '期'
                            # (issue), then restore the delimiter on each entry
                            similar = contents.xpath(
                                "//*[@id='xiangsi']/table[2]/tbody/tr[3]/td/table/tbody/tr/td/text()")
                            si = ''.join(similar).replace('\r\n', '').split('期')
                            po = []
                            for seg in si:
                                sis = seg + '期'
                                if len(sis) > 3:
                                    po.append(sis)

                            items['similar_article'] = po

                            li.append(items)

                        except Exception as e:
                            print(e)
                        print(len(li))
                except Exception as e:
                    print(e)

            return li
    
    
    if __name__ == '__main__':
        con = CNKI()
        items = con.content()
        print(items)
        try:
            # Append one JSON object per line (JSON Lines format)
            with codecs.open('./cnki_data.json', 'a+', encoding='utf-8') as fp:
                for i in items:
                    fp.write(json.dumps(i, ensure_ascii=False) + '\n')
        except IOError as err:
            print('error: ' + str(err))
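
    Because each record is written as one JSON object per line, the results are easy to load back for analysis. A minimal sketch:

    import json

    # Read the scraped records back in (one JSON object per line)
    records = []
    with open('./cnki_data.json', encoding='utf-8') as fp:
        for line in fp:
            line = line.strip()
            if line:
                records.append(json.loads(line))

    print(len(records), 'records loaded')
    if records:
        print(records[0]['title'], '|', records[0]['source'])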
    
    

    That's it!
    I hope this helps. I'm still a beginner, so please point out anything that's wrong.
