美文网首页
Python 从国家统计局爬取全国各省市区数据到mysql

Python 从国家统计局爬取全国各省市区数据到mysql

作者: Rinaloving | 来源:发表于2020-05-22 22:52 被阅读0次
    # -*- coding: utf-8 -*-
    import re
    import requests
    import time
    import operator
    import pymysql
    from functools import reduce
    import uuid
    
    # Module-level configuration and the shared MySQL connection used by the
    # fun_*_db helpers below.
    n = 0  # not referenced anywhere else in the visible script
    save_route = 'E://China_Province_2019_test.txt'  # data storage path (used by fun_write_to_txt)
    # NOTE(review): connection settings, including credentials, are hard-coded
    # here; the connection is opened at import time and is never closed in the
    # visible code.
    conn = pymysql.connect(
     host='localhost',
     user='root',
     passwd='root',
     database='province',
     charset='utf8'
    )
    code = str(uuid.uuid4())  # random UUID string; not referenced elsewhere in the visible script
    
    
    def fun_getName(result4):
        """Return the entries of ``result4`` whose second element has no '0' in it.

        Each entry is indexed as ``entry[1]``; entries for which that value
        contains the character '0' are dropped, and the remaining entries are
        returned in their original order as a new list.
        """
        return [entry for entry in result4 if '0' not in entry[1]]
    def fun_write_to_txt(address):
        """Append ``address`` to the file at ``save_route`` as one '---'-prefixed line."""
        # The context manager closes the file on exit, so no explicit close is needed.
        with open(save_route, 'a', encoding='utf-8') as out_file:
            out_file.write("---" + address + '\n')
    
    def fun_Insert_to_db(pkcode,value,fkcode,ntype):
        """Insert one row into ``tbprovince`` and commit it.

        Args:
            pkcode: value stored in the ``pkcode`` column.
            value: value stored in the ``sname`` column.
            fkcode: value stored in the ``fkcode`` column.
            ntype: value stored in the ``ntype`` column.

        The statement is parameterized and runs on the module-level ``conn``.
        """
        sql = "INSERT INTO tbprovince (pkcode,sname,fkcode, ntype) VALUES (%s,%s,%s,%s)"
        # Scope the cursor so it is closed even if execute() raises; the
        # original left one open cursor behind per call.
        with conn.cursor() as cursor:
            cursor.execute(sql, (pkcode, value, fkcode, ntype))
        conn.commit()
    
    def fun_Query_from_db(value,fkcode):
        """Return what ``cursor.execute`` reports for the rows matching a name and parent.

        Args:
            value: name compared against the ``sname`` column (converted with ``str``).
            fkcode: value compared against the ``fkcode`` column.

        Returns:
            The return value of ``cursor.execute`` for the SELECT; callers use
            it as a truthy "a record exists" flag.
        """
        # Bind the values as parameters instead of concatenating them into the
        # SQL text: the names come from scraped HTML, so a quote character in
        # one would otherwise break (or alter) the statement.
        sql = "select pkcode from tbprovince where sname=%s and fkcode = %s"
        params = (str(value), fkcode)
        with conn.cursor() as cursor:
            # Debug output: the statement as it will be sent to the server.
            print(cursor.mogrify(sql, params))
            res = cursor.execute(sql, params)
        print(res)
        return res
    def fun_Get_fkcode_db(value):
        """Return the ``fkcode`` of the first ``tbprovince`` row whose ``sname`` matches.

        Args:
            value: name compared against the ``sname`` column (converted with ``str``).

        Returns:
            The first column of the first matching row, or ``None`` when no row
            matches. (The original raised ``UnboundLocalError`` in that case
            because the result variable was only assigned when a row existed.)
        """
        # Parameterized to keep scraped text out of the SQL string.
        sql = "select fkcode from tbprovince where sname=%s"
        params = (str(value),)
        with conn.cursor() as cursor:
            # Debug output: the statement as it will be sent to the server.
            print(cursor.mogrify(sql, params))
            cursor.execute(sql, params)
            row = cursor.fetchone()
        if row is None:
            return None
        print(row[0])
        return row[0]
    def fun_Get_pkcode_db(value,fkcode):
        """Return the ``pkcode`` of the first row matching a name and parent code.

        Args:
            value: name compared against the ``sname`` column (converted with ``str``).
            fkcode: value compared against the ``fkcode`` column.

        Returns:
            The first column of the first matching row, or ``None`` when no row
            matches. (The original raised ``UnboundLocalError`` in that case
            because the result variable was only assigned when a row existed.)
        """
        # Parameterized to keep scraped text out of the SQL string.
        sql = "select pkcode from tbprovince where sname=%s and fkcode = %s"
        params = (str(value), fkcode)
        with conn.cursor() as cursor:
            # Debug output: the statement as it will be sent to the server.
            print(cursor.mogrify(sql, params))
            cursor.execute(sql, params)
            row = cursor.fetchone()
        if row is None:
            return None
        print(row[0])
        return row[0]
    
    
    # areas = ['月湖区','余江区','贵溪市']
    # for area in range(len(areas)):
    #     pkid  = str(uuid.uuid4())
    #     fun_Insert_to_db(pkid,areas[area],"d28bb303-64aa-4abd-954c-258b2d0fe992","2")
    
    # Scratch lists kept from the original script; nothing in the visible code
    # reads or fills them.
    results2 = []
    results3 = []
    results4 = []
    results5 = []
    Dates1 = []
    kv = {'user-agent': 'Mozilla/5.0'}  # request headers sent with every page fetch

    # Download the index page and collect every (href, link text) pair on it.
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'
    # Without a timeout requests waits indefinitely on a stalled server.
    r = requests.get(url, headers=kv, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    pattern = re.compile("<a href='(.*?)'>(.*?)<")   # captures (href, link text)
    # set() removes duplicate pairs; the resulting order is therefore arbitrary.
    result1 = list(set(re.findall(pattern, r.text)))
    print('result1')
    # Crawl three levels of pages starting from the links in ``result1`` and
    # store each name in tbprovince with ntype "0", "1" and "2" respectively,
    # each row pointing at its parent through fkcode.
    base_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
    pattern2 = re.compile("<a href='(.*?)'>(.*?)<")  # captures (href, link text); compiled once


    def _get_or_create_code(name, parent_code, level, *, echo_existing=False):
        """Return the pkcode stored for (name, parent_code), inserting a new row if absent."""
        if fun_Query_from_db(name, parent_code):
            if echo_existing:
                print(name)
            return fun_Get_pkcode_db(name, parent_code)
        new_code = str(uuid.uuid4())
        fun_Insert_to_db(new_code, str(name), parent_code, level)
        return new_code


    def _fetch_named_links(page_url):
        """Wait 3 seconds, download ``page_url`` and return its links filtered by fun_getName."""
        time.sleep(3)  # fixed pause before every request
        resp = requests.get(page_url, headers=kv, timeout=30)  # timeout avoids hanging forever
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
        return fun_getName(list(set(re.findall(pattern2, resp.text))))


    # NOTE(review): ``result1`` is used unfiltered here, whereas the two lower
    # levels go through fun_getName; confirm that no unrelated anchor on the
    # index page matches the pattern.
    for first_href, first_name in result1:
        try:
            # Level 0: reuse the stored pkcode or insert a new row with an empty fkcode.
            first_code = _get_or_create_code(first_name, "", "0", echo_existing=True)
            # Every href is appended to the same base URL.
            for second_href, second_name in _fetch_named_links(base_url + first_href):
                # Level 1: the parent is the level-0 row.
                second_code = _get_or_create_code(second_name, first_code, "1")
                for _third_href, third_name in _fetch_named_links(base_url + second_href):
                    # Level 2: the parent is the level-1 row; its own page is not fetched.
                    _get_or_create_code(third_name, second_code, "2")
        except Exception as e:
            # Best effort: report the failure and continue with the next
            # top-level entry; the rest of the failed entry's subtree is skipped.
            print(e)

    print('well_done')
    
    
    省市区.png

    相关文章

      网友评论

          本文标题:Python 从国家统计局爬取全国各省市区数据到mysql

          本文链接:https://www.haomeiwen.com/subject/nqrhahtx.html