Python Web Scraping 2: Wikipedia

By sunhaiyu | Published 2017-07-15 09:49

    Random link hopping

    Get the article links from a Wikipedia page and jump to one of them at random. The sidebar and footer may contain other links that we don't want, so we restrict the search to the article body, which sits in the div tag with id bodyContent.

    import random
    import re
    import requests
    from bs4 import BeautifulSoup
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/52.0.2743.116 Safari/537.36 Edge/15.16193'}
    
    start_url = '/wiki/Wiki'
    
    
    def get_links(url):
        r = requests.get('https://en.wikipedia.org' + url, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        # /wiki/some_words
        link_list = soup.find('div', id='bodyContent').find_all('a', href=re.compile(r'^/wiki/[^/]*$'))
        return link_list
    
    
    links = get_links(start_url)
    while len(links) > 0:
        # Pick one of the links at random
        link = random.choice(links).get('href')
        print(link)
        # The links from the new article replace the previous ones, so the walk keeps going
        links = get_links(link)
    
    /wiki/Personal_wiki
    /wiki/Database_management_system
    /wiki/Netezza
    /wiki/C%2B%2B
    /wiki/C%2B%2B#Standardization
    /wiki/ISO_9984
    /wiki/Georgian_script
    ...
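
    As the sample output shows, the pattern ^/wiki/[^/]*$ also lets through links with a #fragment (such as /wiki/C%2B%2B#Standardization), and the loop only stops if a page has no matching links at all. A small variation of the loop above, my own tweak rather than anything from the book, caps the number of hops and strips the fragment before requesting:

    def random_walk(start_url, max_hops=10):
        """Follow random article links for at most max_hops steps."""
        url = start_url
        for _ in range(max_hops):
            links = get_links(url)
            if not links:
                break
            # Strip any #fragment so we always request the plain article URL.
            url = random.choice(links).get('href').split('#')[0]
            print(url)


    # random_walk('/wiki/Wiki')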
    

    Start from the main page, put all of its article links into a set (which also deduplicates them), then walk through the set and search recursively from each new link.

    import re
    import requests
    from bs4 import BeautifulSoup
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/52.0.2743.116 Safari/537.36 Edge/15.16193'}
    
    pages = set()
    
    
    def get_links(url):
        global pages
        r = requests.get('https://en.wikipedia.org' + url, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        # /wiki/some_words
        link_list = soup.find('div', id='bodyContent').find_all('a', href=re.compile(r'^/wiki/[^:/]*$'))
        for link in link_list:
            if link['href'] not in pages:
                new_page = link['href']
                pages.add(new_page)
                print(new_page)
                get_links(new_page)
    
    
    if __name__ == '__main__':
        # An empty string means the URL is the Wikipedia main page, https://en.wikipedia.org
        get_links('')
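
    One thing to watch with this version: every new link adds a stack frame, so on a site as densely linked as Wikipedia it will hit Python's default recursion limit (roughly 1000 frames) long before it runs out of articles. An iterative rewrite with an explicit stack, my own variation rather than the book's code, avoids that:

    def crawl_iteratively(start_url=''):
        """Same traversal as get_links above, using an explicit stack instead of recursion."""
        seen = set()
        stack = [start_url]
        while stack:
            url = stack.pop()
            r = requests.get('https://en.wikipedia.org' + url, headers=headers)
            soup = BeautifulSoup(r.text, 'lxml')
            link_list = soup.find('div', id='bodyContent').find_all(
                'a', href=re.compile(r'^/wiki/[^:/]*$'))
            for link in link_list:
                href = link['href']
                if href not in seen:
                    seen.add(href)
                    print(href)
                    stack.append(href)


    # crawl_iteratively('')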
    

    Getting an article's title and body text

    The title is in the h1 tag, and the body text is in the div tag with id mw-content-text.

    import re
    import requests
    from bs4 import BeautifulSoup
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/52.0.2743.116 Safari/537.36 Edge/15.16193'}
    
    pages = set()
    
    
    def get_links(url):
        global pages
        r = requests.get('https://en.wikipedia.org' + url, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        # /wiki/some_words
        try:
            print(soup.h1.string)
            # Print only the first paragraph
            print(soup.find(id='mw-content-text').find('p').text)
        except AttributeError:
            print('This page is missing some attributes.')
    
        link_list = soup.find('div', id='bodyContent').find_all('a', href=re.compile(r'^/wiki/[^:/]*$'))
        for link in link_list:
            if link['href'] not in pages:
                new_page = link['href']
                pages.add(new_page)
                print('----------\n' + new_page)
                get_links(new_page)
    
    
    if __name__ == '__main__':
        # An empty string means the URL is the Wikipedia main page, https://en.wikipedia.org
        get_links('')
    
    Main Page
    Noye's Fludde is a one-act opera written largely for young amateur performers, created by the British composer Benjamin Britten. First performed in 1958 at the annual Aldeburgh Festival, it is based on the 15th-century Chester "mystery" play which recounts the biblical story of Noah, the flood and the ark. Britten had written numerous works for mixed 
    ...
    --------
    /wiki/Wikipedia
    Wikipedia
    Wikipedia (/ˌwɪkᵻˈpiːdiə/ ( listen) or /ˌwɪkiˈpiːdiə/ ( listen) WIK-i-PEE-dee-ə) is a free online encyclopedia with the aim to allow anyone to edit articles.[3] Wikipedia is the largest and most popular general reference work on the Internet[4][5][6] and is ranked among the ten most popular websites.[7] Wikipedia is owned by the nonprofit Wikimedia Foundation.[8][9][10]
    --------
    /wiki/Main_Page
    ...
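
    If all you want is the title and opening paragraph of a single article, the extraction part can be pulled out of the crawl into its own small helper. A minimal sketch along the same lines (my own helper, not from the book):

    def get_title_and_intro(path):
        """Return (title, first paragraph) for an article path such as '/wiki/Wikipedia'."""
        r = requests.get('https://en.wikipedia.org' + path, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        title = soup.h1.get_text()
        # The first <p> inside mw-content-text is usually the lead paragraph.
        first_p = soup.find(id='mw-content-text').find('p')
        return title, first_p.get_text() if first_p else ''


    # print(get_title_and_intro('/wiki/Wikipedia'))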
    

    Finding external links

    Starting from https://www.oreilly.com, keep hunting for external links. If a page has no external links, follow one of its internal links and look for external links from there. This example doesn't feel great, because an external link may well lead back to the starting site.

    import re
    import random
    import requests
    from bs4 import BeautifulSoup
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/52.0.2743.116 Safari/537.36 Edge/15.16193'}
    
    
    def get_random_external_link(start_page):
        r = requests.get(start_page, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        # The first element of the split address is normally the site's domain
        ex_links = get_external_links(soup, split_address(start_page)[0])
        # If the page has no external links, collect its internal links, pick one at random, and recurse until an external link turns up.
        if len(ex_links) == 0:
            internal_links = get_internal_links(soup, split_address(start_page)[0])
            return get_random_external_link(random.choice(internal_links))
        else:
            return random.choice(ex_links)
    
    
    def get_internal_links(bs, include_url):
        internal_links = []
        # Links that start with '/' (or contain the site's address) are internal links
        in_links = bs.find_all('a', href=re.compile(r'^/|' + include_url))
        for link in in_links:
            if link['href'] not in internal_links:
                internal_links.append(link['href'])
    
        return internal_links
    
    
    def get_external_links(bs, exclude_url):
        external_links = []
        # Links starting with http or https that do not contain the internal address are external; (?!...) is a negative lookahead
        ex_links = bs.find_all('a', href=re.compile(r'^(https|http)((?!' + exclude_url + ').)*$'))
        for link in ex_links:
            if link['href'] not in external_links:
                external_links.append(link['href'])
    
        return external_links
    
     
    def split_address(address):
        address_parts = []
    
        if address.split(':')[0] == 'http':
            address_parts = address.replace('http://', '').split('/')
        elif address.split(':')[0] == 'https':
            address_parts = address.replace('https://', '').split('/')
    
        return address_parts
    
    
    # Follow external links only
    def follow_external_only(url):
        external_link = get_random_external_link(url)
        print(external_link)
        follow_external_only(external_link)
    
    
    all_ex_links = set()
    all_in_links = set()
    
    # Collect all external and internal links, printing the external ones
    def get_all_external_links(url):
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        internal_links = get_internal_links(soup, split_address(url)[0])
        external_links = get_external_links(soup, split_address(url)[0])
        for link in external_links:
            if link not in all_ex_links:
                all_ex_links.add(link)
                print(link)
    
        for link in internal_links:
            if link not in all_in_links:
                all_in_links.add(link)
                get_all_external_links(link)
    
    
    if __name__ == '__main__':
        # follow_external_only('https://www.oreilly.com')
        get_all_external_links('https://www.oreilly.com')
    
    https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170601+nav
    http://shop.oreilly.com/
    http://members.oreilly.com
    https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170505+homepage+get+started+now
    https://www.safaribooksonline.com/accounts/login/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170203+homepage+sign+in
    https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170710+homepage+get+started+now
    https://www.safaribooksonline.com/public/free-trial/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170710+homepage+start+free+trial
    https://www.safaribooksonline.com/accounts/login/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170710+homepage+sign+in
    ...
    

    The code above errors fairly often. Network hiccups are one cause, but the link handling is also fragile: relative internal links such as /about are passed straight to requests.get() without being joined to the domain, and URLs are interpolated into the regular expressions without escaping.
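
    One way to make the link classification sturdier is to lean on urllib.parse instead of hand-built regexes: urljoin resolves relative links against the current page, and urlparse turns the internal/external test into an exact domain comparison. The sketch below is my own, not the book's; the name classify_links is made up for illustration:

    from urllib.parse import urljoin, urlparse


    def classify_links(soup, base_url):
        """Split a page's <a href> values into internal and external sets."""
        base_domain = urlparse(base_url).netloc
        internal, external = set(), set()
        for a in soup.find_all('a', href=True):
            # Resolve relative links (/about, ../foo, #frag) against the current page.
            full = urljoin(base_url, a['href'])
            parts = urlparse(full)
            if parts.scheme not in ('http', 'https'):
                continue  # skip mailto:, javascript: and similar schemes
            if parts.netloc == base_domain:
                internal.add(full)
            else:
                external.add(full)
        return internal, external

    With something like this, get_internal_links and get_external_links collapse into a single call, and split_address is no longer needed.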


    by @sunhaiyu

    2017.7.14
