
Using the SQLite3 Database

Author: 戒灵 | Published 2018-11-01 18:40

    1. Download SQLiteStudio, a GUI tool that makes it easy to create the tables by hand.
    2. Write a dedicated module for database access (this is the local_db module imported by the spider in section 6):

    # -*- coding: UTF-8 -*-
    
    import os
    import sys
    import sqlite3
    
    # sys.path.append('.')
    # from common.local_util import *
    # import local_util
    
    class DB:
        new_db = False
    
        def __init__(self, sqlite3_db='test.db', check_same_thread=False):
            self.sqlite3_db        = sqlite3_db
            self.check_same_thread = check_same_thread

            # If the database file does not exist yet, remember to create the tables.
            if not os.path.exists(sqlite3_db):
                self.new_db = True

            # isolation_level=None puts sqlite3 in autocommit mode;
            # check_same_thread=False allows the connection to be shared across threads.
            self.conn = sqlite3.connect(sqlite3_db, isolation_level=None, check_same_thread=self.check_same_thread)
            self.cursor = self.conn.cursor()
            self.create_db()
    
        def query(self, sql, param=None):
            if param:
                res = self.cursor.execute(sql, param)
            else:
                res = self.cursor.execute(sql)
            self.conn.commit()
            return res
    
        def create_db(self):
            # On first use, create the two tables needed by the crawler.
            if self.new_db:
                self.new_db = False
                # Create amazon_bestseller_list first, since the category table references it.
                sql = '''CREATE TABLE amazon_bestseller_list (
                    id              INTEGER PRIMARY KEY AUTOINCREMENT,
                    top_category    TEXT    NOT NULL,
                    category        TEXT    DEFAULT (''),
                    category_url    TEXT    NOT NULL,
                    level           INTEGER NOT NULL
                                            DEFAULT (0),
                    is_active       INTEGER DEFAULT (0),
                    UNIQUE (
                        top_category,
                        category_url
                    )
                )'''
                self.query(sql)
                sql = '''CREATE TABLE amazon_bestseller_category (
                    category_id  INTEGER NOT NULL
                                         REFERENCES amazon_bestseller_list (id),
                    rank         INTEGER,
                    top_rank     INTEGER DEFAULT (0),
                    review       INTEGER DEFAULT (0),
                    star         TEXT    DEFAULT (0),
                    gen_date     TEXT    DEFAULT (''),
                    price        TEXT    DEFAULT (''),
                    detail_link  TEXT    NOT NULL,
                    sale_monthly TEXT    DEFAULT (0),
                    is_active    INTEGER DEFAULT (0),
                    create_time  TEXT,
                    UNIQUE (
                        category_id,
                        detail_link
                    )
                )'''
                self.query(sql)
    
        def close(self):
            self.conn.close()
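
    With the module in place (saved as local_db.py, which is what the spider in section 6 imports), it can be used on its own. A minimal sketch; the table and column names come from create_db above, everything else is illustrative:

    # Minimal usage sketch of the DB helper above (illustrative, not from the original post).
    import local_db

    db = local_db.DB('test.db')   # creates test.db and the two tables on first run
    cur = db.query("SELECT id, top_category FROM amazon_bestseller_list WHERE is_active = ?", (1,))
    for row in cur.fetchall():
        print(row)
    db.close()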
    

    3. SQLite3 query statement:

    sql = "SELECT a.detail_link,b.top_category FROM amazon_bestseller_category a LEFT JOIN amazon_bestseller_list b ON a.category_id = b.id WHERE (a.top_rank=0 or a.sale_monthly=0) and a.is_active = 1 limit 3000"
    

    4. SQLite3 update statement (only the parameter tuple and the call were included; a possible UPDATE is sketched after the fragment):

            params=(star,dateFirstAvailable,top_rank,sale_monthly,detail_link)
            self.db.query(sql, params)
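
    The UPDATE text itself was not included in the fragment above. Judging from the parameter order and the columns of amazon_bestseller_category, it presumably looks something like the following; this is an assumption, not the author's original SQL (in particular, dateFirstAvailable is taken to fill the gen_date column):

            # Assumed shape of the missing UPDATE statement (not the original author's SQL).
            # The placeholder order matches the params tuple above.
            sql = ("UPDATE amazon_bestseller_category "
                   "SET star = ?, gen_date = ?, top_rank = ?, sale_monthly = ? "
                   "WHERE detail_link = ?")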
    

    5. SQLite3 insert statement (the matching INSERT appears in the complete example below and is repeated after the fragment):

                params = (self.category_id,rank,review,link,price,'1',create_time)
                self.db.query(sql, params)
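
    For reference, the INSERT statement that goes with this parameter tuple, taken from the complete example in section 6:

                # INSERT OR REPLACE relies on the UNIQUE (category_id, detail_link) constraint
                # from section 2: re-crawled rows overwrite the existing ones instead of failing.
                sql = "INSERT OR REPLACE into amazon_bestseller_category(category_id,rank,review,detail_link,price,is_active,create_time) values(?,?,?,?,?,?,?)"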
    

    6. Final example (query the linked field and the desired columns across the two tables and update 3,000 records):

    # -*- coding: utf-8 -*-
    
    # import re
    # import json
    #import MySQLdb
    import time
    import datetime
    import os
    import sys
    import argparse
    
    # sys.path.append('.')
    # from common import local_db, local_util
    
    import scrapy
    import requests
    from scrapy.crawler import CrawlerProcess
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import BaseSpider
    from scrapy.spiders.init import InitSpider
    from scrapy.http import Request, FormRequest
    from scrapy.http.request.form import _get_form,_get_inputs
    from scrapy.linkextractors import LinkExtractor
    from scrapy.exceptions import CloseSpider
    from scrapy.spiders import Rule
    from scrapy.selector import Selector
    from scrapy.http.cookies import CookieJar
    
    import local_db
    
    # The following imports are required when packaging the script
    # import robotparser
    
    import scrapy.spiderloader
    import scrapy.statscollectors
    import scrapy.logformatter
    import scrapy.dupefilters
    import scrapy.squeues
    
    import scrapy.extensions.spiderstate
    import scrapy.extensions.corestats
    import scrapy.extensions.telnet
    import scrapy.extensions.logstats
    import scrapy.extensions.memusage
    import scrapy.extensions.memdebug
    import scrapy.extensions.feedexport
    import scrapy.extensions.closespider
    import scrapy.extensions.debug
    import scrapy.extensions.httpcache
    import scrapy.extensions.statsmailer
    import scrapy.extensions.throttle
    
    import scrapy.core.scheduler
    import scrapy.core.engine
    import scrapy.core.scraper
    import scrapy.core.spidermw
    import scrapy.core.downloader
    
    import scrapy.downloadermiddlewares.stats
    import scrapy.downloadermiddlewares.httpcache
    import scrapy.downloadermiddlewares.cookies
    import scrapy.downloadermiddlewares.useragent
    import scrapy.downloadermiddlewares.httpproxy
    import scrapy.downloadermiddlewares.ajaxcrawl
    # import scrapy.downloadermiddlewares.chunked
    import scrapy.downloadermiddlewares.decompression
    import scrapy.downloadermiddlewares.defaultheaders
    import scrapy.downloadermiddlewares.downloadtimeout
    import scrapy.downloadermiddlewares.httpauth
    import scrapy.downloadermiddlewares.httpcompression
    import scrapy.downloadermiddlewares.redirect
    import scrapy.downloadermiddlewares.retry
    import scrapy.downloadermiddlewares.robotstxt
    
    import scrapy.spidermiddlewares.depth
    import scrapy.spidermiddlewares.httperror
    import scrapy.spidermiddlewares.offsite
    import scrapy.spidermiddlewares.referer
    import scrapy.spidermiddlewares.urllength
    
    import scrapy.pipelines
    
    import scrapy.core.downloader.handlers.http
    import scrapy.core.downloader.contextfactory
    
    
    class AmazonBestsellerCategorySpider(InitSpider):
        name = 'amazon_bestseller_category'
        allowed_domains = ['www.amazon.com']
        try_login_max_time = 3
    
        def __init__(self, page_url=None, rank_max='20'):
            # Initialize the base Spider first (sets name, start_urls, etc.).
            super(AmazonBestsellerCategorySpider, self).__init__()
            self.page_url = page_url
            self.rank_max = int(rank_max)
            self.db = local_db.DB('./data/data.db')
            # Look up the id of this category page in amazon_bestseller_list.
            sql = "select id, category_url from amazon_bestseller_list where category_url=?"
            cur = self.db.query(sql, (page_url,))
            row = cur.fetchone()
            # self.log(row)
            self.category_id = row[0] if row else 0
    
        def init_request(self):
            yield Request(url=self.page_url,callback=self.list_page)
    
        def list_page(self,response):
            sel = Selector(response)
            row_ele = sel.xpath("//ol[@id='zg-ordered-list']//li")
            for row in row_ele:
                rank = row.xpath(".//span/span[contains(@class,'zg-badge-text')]/text()").extract_first()
                rank = rank.replace('#', '')
                review = row.xpath(".//span[contains(@class,'aok-inline-block')]/div[contains(@class,'a-icon-row')]/a[contains(@class,'a-size-small')]/text()").extract_first()
                if not review:
                    review = '0'
                price = row.xpath(".//span[contains(@class,'p13n-sc-price')]/text()").extract_first()
                detail_url = row.xpath(".//span[contains(@class,'zg-item')]/a[@class='a-link-normal']/@href").extract_first()
                detail_url_parsed = response.urljoin(detail_url)
                # Keep only the canonical /dp/ part of the product URL and drop the /ref tracking suffix.
                link_dp = detail_url_parsed[detail_url_parsed.find('/dp/'):]
                link_ref = response.urljoin(link_dp)
                link = link_ref[:link_ref.find('/ref')]
                create_time = datetime.datetime.now().strftime('%Y-%m-%d')
    
                if int(rank) > self.rank_max:
                    continue
                sql = "INSERT OR REPLACE into amazon_bestseller_category(category_id,rank,review,detail_link,price,is_active,create_time) values(?,?,?,?,?,?,?)"
                params = (self.category_id,rank,review,link,price,'1',create_time)
                self.db.query(sql, params)
    
    
    if __name__ == "__main__":
        script_path = os.path.split( os.path.realpath( sys.argv[0] ) )[0]
        os.chdir(script_path)
        start_at = time.time()
    
        if not os.path.exists('./data'):
            os.mkdir('./data')
    
        parser = argparse.ArgumentParser(description='Arguments')
        parser.add_argument('--page_url', help='Page_url', required=True)
        parser.add_argument('--rank_max', help='rank_max', required=False, default='20')
        args = vars(parser.parse_args())
        page_url = args['page_url']
        rank_max = args['rank_max']
        params = {'page_url':page_url,'rank_max':rank_max}
        CrawlSettings = {
            'BOT_NAME': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
            'DOWNLOAD_TIMEOUT' : 60,
            'DOWNLOAD_DELAY': 3,
            'DEFAULT_REQUEST_HEADERS': {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en',
            },
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        }
        process = CrawlerProcess(CrawlSettings)
        process.crawl(AmazonBestsellerCategorySpider,**params)
        process.start()
    
        end_at = time.time()
        print(end_at - start_at)  # elapsed seconds
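
    The spider is driven from the command line: --page_url is required and --rank_max defaults to 20 (only products ranked within rank_max are saved). A hypothetical invocation, assuming the script above is saved as amazon_bestseller_category.py and the URL is a real Best Sellers category page:

    python amazon_bestseller_category.py --page_url <bestseller-category-url> --rank_max 20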
    
