国内PIP镜像源
https://pypi.tuna.tsinghua.edu.cn/simple
1. 如果是拨号VPS,设置拨号账号
[root@localhost ~]# pppoe-setup
...
[root@localhost ~]# ip address
...
[root@localhost ~]# pppoe-start
...
2. 安装Python3.6
[root@localhost ~]# yum install -y epel-release
...
[root@localhost ~]# yum install -y python36
...
[root@localhost ~]# python3 -V
Python 3.6.8
3. 更新pip3为最新版本,并修改默认镜像源
[root@localhost ~]# pip3 install --upgrade pip
Installing collected packages: pip
Successfully installed pip-19.2.3
[root@localhost ~]# mkdir .pip
[root@localhost ~]# cd .pip
[root@localhost .pip]# vi pip.conf
[global]
timeout = 6000
index-url=https://pypi.tuna.tsinghua.edu.cn/simple
trusted-host = pypi.tuna.tsinghua.edu.cn
4. 安装系统依赖工具包
[root@localhost ~]# pip3 install bs4
...
[root@localhost ~]# pip3 install scrapy==1.6.0
...
其他你自己的爬虫需要引用的第三方包
5. 安装MS SQL SERVER 数据库驱动
[root@localhost ~]# curl https://packages.microsoft.com/config/rhel/7/prod.repo > /etc/yum.repos.d/mssql-release.repo
...
[root@localhost ~]# yum install unixODBC -y
[root@localhost ~]# yum install msodbcsql17 -y
...
视情况安装mysql odbc
[root@iZbp108zipn94hwfsywfxyZ ~]# yum install https://www.devart.com/odbc/mysql/devart-odbc-mysql.x86_64.rpm
6. 安装数据库访问组件pyodbc
[root@iZbp108zipn94hwfsywfxyZ ~]# yum install gcc-c++
[root@iZbp108zipn94hwfsywfxyZ ~]# yum install python36-devel
[root@iZbp108zipn94hwfsywfxyZ ~]# yum install unixODBC-devel
...
[root@localhost ~]# pip3 install pyodbc
...
7. 配置并启动Scrapyd服务
[root@localhost scrapyd]# pip3 install scrapyd
...
[root@localhost ~]# cd /var/log
[root@localhost log]# mkdir scrapyd
[root@localhost log]# cd scrapyd
[root@localhost scrapyd]# mkdir logs
[root@localhost scrapyd]# mkdir eggs
[root@localhost scrapyd]# mkdir dbs
[root@localhost scrapyd]# cd /etc/scrapyd
没有此路径就创建
[root@localhost scrapyd]# vi scrapyd.conf
[scrapyd]
eggs_dir = /var/log/scrapyd/eggs
logs_dir = /var/log/scrapyd/logs
items_dir = /var/log/scrapyd/items
jobs_to_keep = 5
dbs_dir = /var/log/scrapyd/dbs
max_proc = 0
max_proc_per_cpu = 4
finished_to_keep = 100
poll_interval = 5.0
bind_address = 0.0.0.0
http_port = 63800
debug = off
runner = scrapyd.runner
application = scrapyd.app.application
launcher = scrapyd.launcher.Launcher
webroot = scrapyd.website.Root
...
[root@localhost scrapyd]# nohup scrapyd &
...
8. 配置并启动logparser
[root@localhost ~]# pip3 install logparser
...
[root@localhost ~]# cd /usr/local/lib/python3.6/site-packages/logparser
[root@localhost logparser]# vi settings.py
修改以下两处配置
SCRAPYD_LOGS_DIR = '/var/log/scrapyd/logs'
SCRAPYD_SERVER = '127.0.0.1:63800'
[root@localhost ~]# nohup logparser &
9. 开放63800端口
添加指定需要开放的端口
[root@localhost ~]# systemctl start firewalld.service
[root@localhost ~]# firewall-cmd --add-port=63800/tcp --permanent
...
重载入添加的端口:
[root@localhost ~]# firewall-cmd --reload
...
查询指定端口是否开启成功:
[root@localhost ~]# firewall-cmd --query-port=63800/tcp
...
10. 检查服务是否启动,并终止任务的运行
[root@localhost ~]# ps -ef|grep scrapyd
...
[root@localhost ~]# ps -ef|grep logparser
...
终止进程
[root@localhost ~]# kill -s 9 4056
...
网友评论