思路
第一个程序已经把资源找到了, 这个程序把资源存下来, 方便重用. 思路如下:
- 用sqlite保存数据
- 如果数据已经存在, 更新, 否则创建.
这样以后直接使用数据库里的数据即可, 不用再抓取一遍.
代码
代码如下:
#! /usr/bin/python
# -*- coding: UTF-8 -*-
"""
巫师3 books的链接.
原链接如下: http://witcher.wikia.com/wiki/Category:The_Witcher_3_books
在这个链接能更好地看到数据结构, 而程序里的链接是更方便获取分页查找的链接, 结构完全相同.
作者: 萌萌哒小肥他爹
简书: blog.yunshichen.com
"""
from bs4 import BeautifulSoup
from crawler import crawl_helper
import time
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, UnicodeText, TIMESTAMP
from sqlalchemy.orm import sessionmaker,relationship
from sqlalchemy import create_engine
# ---- Crawler configuration
# Paginated AJAX endpoint for the "The Witcher 3 books" category; %d is the page number.
witcher3_books_url_template = 'http://witcher.wikia.com/index.php?action=ajax&articleId=The+Witcher+3+books&method=axGetArticlesPage&rs=CategoryExhibitionAjax&page=%d'
g_domain = 'http://witcher.wikia.com'  # prefix for the relative hrefs found in listings
g_count = 0  # crawled-item counter; NOTE(review): never incremented in this snippet — confirm it is used elsewhere
# ---- Database configuration, using SQLite
db_path = 'tw3_resource.db'  # created in the directory the script is run from
db_engine = create_engine('sqlite:///' + db_path)
Base = declarative_base()
class WitcherResource(Base):
    """ORM model holding one crawled Witcher 3 resource per row."""

    __tablename__ = 'WitcherResource'

    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(String(100), nullable=True)     # article title
    content = Column(UnicodeText, nullable=True)   # extracted article text
    url = Column(String(500), nullable=True)       # source page URL
    category = Column(String(100), nullable=True)  # resource category, e.g. 'books'
# Create the table if it does not already exist, then open one module-level
# session that all functions below share (committed in find_index).
Base.metadata.create_all(db_engine)
Session = sessionmaker(bind=db_engine)
g_session = Session()
def do_it(start=1, end=20):
    """Crawl the paginated book-category listing and store every book found.

    Generalizes the previously hard-coded page range (the category currently
    spans 20 pages) into parameters; calling ``do_it()`` with no arguments is
    backward compatible.

    :param start: first listing page to fetch (inclusive), default 1
    :param end: last listing page to fetch (inclusive), default 20
    """
    for page in range(start, end + 1):
        index_url = witcher3_books_url_template % page
        find_index(index_url)
def find_index(index_url):
    """Parse one listing page and persist every book linked from it."""
    response = crawl_helper.do_get(index_url, '', True)
    listing = BeautifulSoup(response['page'], 'html.parser')
    for item in listing.find_all('div', {'class': 'category-gallery-item'}):
        anchor = item.find_all('a')[0]
        book_title = anchor['title']
        detail_url = g_domain + anchor['href']
        # Throttle requests so we do not hammer the wiki server.
        time.sleep(1.17)
        find_detail(detail_url, book_title)
    # One commit per listing page covers all inserts/updates made above.
    g_session.commit()
def find_detail(book_url, title):
    """Fetch one book page and upsert it into the WitcherResource table.

    Example page layout: http://witcher.wikia.com/wiki/Hieronymus%27_notes

    :param book_url: absolute URL of the book's wiki page
    :param title: book title taken from the listing page
    """
    book_html = crawl_helper.do_get(book_url, '', False)
    article_div = book_html.find_all('div', {'class': 'WikiaArticle'})[0]
    # The wiki sometimes marks up the text with <dl>, sometimes with <p>.
    content_tag_list = article_div.find_all('dl')
    # BUG FIX: find_all() returns an empty ResultSet, never None, so the
    # original `is None` check made this <p> fallback unreachable.
    if not content_tag_list:
        content_tag_list = article_div.find_all('p')
    content = None
    for tag in content_tag_list:
        # NOTE(review): only the LAST tag's text survives this loop; if the
        # whole article body is wanted, join all tag texts instead — confirm.
        content = tag.text
    # todo: also capture images and other media in the future
    # Upsert: update the row if this (title, category) pair already exists.
    check_query = g_session.query(WitcherResource).filter_by(title=title, category='books')
    if check_query.first() is not None:
        check_query.update({'title': title, 'content': content, 'url': book_url})
        print('--- book[%s]已存在, 仅更新: ' % title)
    else:
        wr = WitcherResource()
        wr.title = title
        wr.url = book_url
        wr.content = content
        wr.category = 'books'
        g_session.add(wr)
        g_session.flush()
        print('---- 新增书: %s' % title)
# Script entry point: crawl all listing pages when run directly.
if __name__ == '__main__':
    do_it()
网友评论