import time
import random
import re
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
# Root of the site being scraped; the relative schedule links are joined onto it.
BASE_URL = "https://www.wanplus.com"
# Event overview page that links to the individual schedule pages.
EVENT_URL = BASE_URL + "/event/839.html"
def find_schedule_pages():
    """Return the schedule-page paths found in the event page's HTML.

    Downloads ``EVENT_URL`` and extracts every substring of the form
    ``/schedule/<digits>.html``.

    Returns:
        A list of site-relative paths (strings), in order of appearance.
        Duplicates are kept if the page links to a schedule more than once.

    Raises:
        requests.RequestException: if the request fails or exceeds the
            timeout.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    text = requests.get(EVENT_URL, timeout=30).text
    # Raw string avoids the invalid "\d" escape warning; the dot is escaped so
    # it matches a literal "." rather than any character.
    return re.findall(r"/schedule/\d+\.html", text)
def chrome_get_texts(url, min_wait_seconds=1):
    """Open *url* in Chrome, click through the list items, and collect the HTML.

    After loading the page, clicks the 1st through 9th ``<li>`` under
    ``//*[@id="info"]/div[2]/div[3]/ul`` one at a time, and records
    ``driver.page_source`` after each click. Stops at the first item that
    cannot be found or clicked.

    Args:
        url: Address of the page to load.
        min_wait_seconds: Lower bound of the pause after the page load and
            after every click; the actual pause is a random value between
            this and twice this.

    Returns:
        A list of page-source strings, one per successfully clicked item
        (possibly empty).
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of this file.
    from selenium.common.exceptions import WebDriverException

    print("chrome_get_texts %s" % url)
    texts = []
    chrome_options = webdriver.ChromeOptions()
    # Works around the "DevToolsActivePort file doesn't exist" startup error.
    chrome_options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        time.sleep(min_wait_seconds + min_wait_seconds * random.random())
        for i in range(1, 10):
            try:
                # find_element("xpath", ...) works on Selenium 3 and 4; the
                # find_element_by_xpath shortcut was removed in Selenium 4.3.
                driver.find_element(
                    "xpath", '//*[@id="info"]/div[2]/div[3]/ul/li[%d]' % i
                ).click()
                time.sleep(min_wait_seconds + min_wait_seconds * random.random())
                texts.append(driver.page_source)
            except WebDriverException:
                # No such item (or it is not clickable): stop iterating.
                break
    finally:
        # Always shut the browser down so Chrome processes do not pile up.
        driver.quit()
    return texts
def _div_text(soup, css_class):
    """Return the stripped text of the first <div> whose class is *css_class*."""
    return soup.find("div", attrs={"class": css_class}).text.strip()


def _image_alts(soup, css_class):
    """Return the ``alt`` of every <img> in the first <div> with *css_class*."""
    container = soup.find("div", attrs={"class": css_class})
    return [img['alt'] for img in container.find_all('img')]


def get_kings_infos(text):
    """Extract team names, result, and image ``alt`` lists from page HTML.

    Args:
        text: HTML source of a page, parsed with the ``lxml`` parser.

    Returns:
        A 7-tuple ``(home_team, away_team, game_result, home_bans,
        away_bans, home_kings, away_kings)``. The first three are strings;
        the rest are lists of ``alt`` strings taken from the ``<img>`` tags
        inside the ``led_left2``, ``sna_right2``, ``led_left1`` and
        ``sna_right1`` divs respectively. (Presumably banned and picked
        heroes per side -- confirm against the site's markup.)

    Raises:
        AttributeError: if any of the expected ``<div>`` elements is missing.
        KeyError: if an ``<img>`` in one of those divs has no ``alt``.
    """
    soup = BeautifulSoup(text, 'lxml')
    # NOTE: attrs must be a dict. The previous set literal {"class", name}
    # made BeautifulSoup match class "class" OR name, working only by luck.
    home_team = _div_text(soup, "led_left")
    game_result = _div_text(soup, "las_midd")
    away_team = _div_text(soup, "sna_right")

    # Collapse the result text to whichever team name appears in it; if
    # neither does, the raw text is returned unchanged.
    if home_team in game_result:
        game_result = home_team
    elif away_team in game_result:
        game_result = away_team

    home_bans = _image_alts(soup, "led_left2")
    away_bans = _image_alts(soup, "sna_right2")
    home_kings = _image_alts(soup, "led_left1")
    away_kings = _image_alts(soup, "sna_right1")
    return home_team, away_team, game_result, home_bans, away_bans, home_kings, away_kings
def main():
    """Scrape every schedule page and write one CSV row per collected page.

    For each path returned by ``find_schedule_pages()``, loads the page via
    ``chrome_get_texts`` and, for every page source it returns, writes a row
    to ``2019kings.csv`` (UTF-8): home team, away team, result, followed by
    the home bans, away bans, home picks and away picks. Each row is also
    printed.
    """
    # Local import: csv handles quoting of fields that contain commas or
    # quotes, which a plain ','.join would silently corrupt.
    import csv

    # newline='' plus lineterminator='\n' keeps the same "\n" row endings the
    # file had before.
    with open('2019kings.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        for url in find_schedule_pages():
            schedule_url = "%s%s" % (BASE_URL, url)
            for text in chrome_get_texts(schedule_url):
                (home_team, away_team, game_result,
                 home_bans, away_bans, home_kings, away_kings) = get_kings_infos(text)
                items = ([home_team, away_team, game_result]
                         + home_bans + away_bans + home_kings + away_kings)
                print(items)
                writer.writerow(items)
# Run the scraper as soon as this file is executed (or imported).
main()
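# 网友评论 ("Reader comments") -- appears to be a heading left over from the web page this script was copied from; not part of the code.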