简介
由于原先使用的oj是Vijos的一个域(https://vijos.org/d/nnu_contest/),因此希望可以将原先的题目的题面和数据下载下来,移植到新的oj上面。
由于题目较多,约有两百道左右,所以写了一个爬虫爬取题目和数据。
github链接:https://github.com/LPJworkroom/BuddyOJ/tree/master/assist_tool/vijos_prob_spider
使用
python3 Vijos_prob_Spider.py
效果
会在当前文件夹下创建prob文件夹
其中包括vijos中nnu_contest域中的所有可以下载数据的题目
(PS:由于未知原因,爬取过程中,有极少部分题目无法爬取)
(PPS:运行前需要把代码中login函数里data字典的uname和password改成自己的Vijos用户名和密码)
代码如下
# -*- coding:utf-8 -*-
import requests
import json
import re
import time
import os
# Numeric id given to the problem currently being saved; it names the
# per-problem directory and zip file, and main() increments it.
cnt=1000
# Set by login(): False when the problem page yields no title, True otherwise.
flag=False
# Directory that output files are written into; login() rebinds it to
# './prob/<cnt>' for each problem it accepts.
savepath="./prob/"
def get_one_page(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, timeout, or any other
        ``requests.RequestException``).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Inter Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/52.0.2743.116 Safari/537.36 '
    }
    try:
        # Without a timeout a stalled connection would block the crawler
        # forever; a timeout surfaces as a RequestException and yields None.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Extract absolute problem URLs from a problem-list page.

    Args:
        html: HTML text of a listing page, or ``None``/empty string when
            the download failed.

    Returns:
        A list of URLs, each the ``href`` that follows a
        ``col--name col--problem-name`` cell prefixed with
        ``https://vijos.org``, in page order. Empty when *html* is falsy.
    """
    # A failed download hands us None; report "no problems" instead of
    # letting the regex raise TypeError.
    if not html:
        return []
    pattern = re.compile(r'class="col--name col--problem-name".*?href="(.*?)"', re.S)
    return ["https://vijos.org" + path for path in pattern.findall(html)]
def mkdir(path):
    """Create the directory *path* (and missing parents) if it is absent.

    Surrounding whitespace in *path* is ignored.

    Returns:
        True when the directory was created by this call, False when the
        path already existed.
    """
    target = path.strip()
    if os.path.exists(target):
        return False
    os.makedirs(target)
    return True
def login(url, timeout=10):
    """Check the problem page at *url*, prepare its output directory, log in.

    Side effects:
        * Sets the module-level ``flag``: False when the problem page has
          no title, True otherwise.
        * When a title is found, rebinds the module-level ``savepath`` to
          ``'./prob/<cnt>'`` and creates that directory.

    Args:
        url: URL of a single problem page.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A ``requests`` session on which the login form has been posted, or
        ``None`` when the problem page has no title.

    Raises:
        requests.RequestException: if either HTTP request fails or times
            out.
    """
    global flag
    global savepath
    prob_link = requests.get(url, timeout=timeout)
    title = re.findall(r'class="section__header non-scratchpad--hide".*?<h1>(.*?)</h1>', prob_link.text, re.S)
    if not title:  # for problems without data
        flag = False
        return None
    flag = True
    savepath = './prob/' + str(cnt)
    mkdir(savepath)
    data = {
        'uname': '******',  # must be changed to a real user name
        'password': '*******'  # must be changed to the real password
    }
    loginurl = 'https://vijos.org/d/nnu_contest/login'
    # NOTE(review): a fresh session is created and the login form re-posted
    # for every problem; the response is not checked, so a rejected login
    # goes unnoticed here.
    session = requests.session()
    session.post(loginurl, data=data, timeout=timeout)
    return session
def get_prob(session, url, timeout=10):
    """Download the problem statement at *url* and save it under ``savepath``.

    Files written (all UTF-8) into the module-level ``savepath`` directory:
        * ``description.txt`` - first blockquote paragraph, ``<br>`` removed.
        * ``sampleinN.txt`` / ``sampleoutN.txt`` - the ``language-cpp`` code
          blocks; even-indexed blocks are stored as inputs, odd-indexed
          blocks as outputs, numbered from 1.
        * ``info.txt`` - six lines: ``1``, the current ``cnt``, the title,
          ``1``, ``1000``, ``256``.
        * ``cate.txt`` - one matched tag per line.

    Nothing is written when the page lacks the expected body section, title
    or description; a short message is printed instead.

    Args:
        session: Not used by this function; kept so existing callers work.
        url: URL of a single problem page.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.RequestException: if the page download fails or times out.
    """
    page = requests.get(url, timeout=timeout).text
    content = re.findall(r'<div class="section__body typo">(.*?)</div>', page, re.S)
    title = re.findall(r'class="section__header non-scratchpad--hide".*?<h1>(.*?)</h1>', page, re.S)
    cate = re.findall(r'class="problem__tags".*?href=".*?">(.*?)</a>', page, re.S)
    # Guard the [0] lookups below: a page without the expected sections
    # used to raise IndexError and abort the whole crawl.
    if not content or not title:
        print("get problem page failed.now url:")
        print(url)
        return
    now_text = content[0]
    desc = re.findall(r'<blockquote>.*?<p>(.*?)</p>.*?</blockquote>', now_text, re.S)
    if not desc:
        print("get description failed.now title:")
        print(title[0])
        return
    desc_out = desc[0].replace('<br>', "")
    data = re.findall(r'<pre><code class="language-cpp">(.*?)</code></pre>', now_text, re.S)
    # Code blocks alternate: even positions are inputs, odd positions outputs.
    input_data = data[0::2]
    output_data = data[1::2]
    with open(savepath + '/' + 'description.txt', 'w', encoding='utf-8') as f:
        f.write(desc_out)
    for tot, item in enumerate(input_data, 1):
        with open(savepath + '/' + "samplein" + str(tot) + '.txt', 'w', encoding='utf-8') as f:
            f.write(item)
    for tot, item in enumerate(output_data, 1):
        with open(savepath + '/' + "sampleout" + str(tot) + '.txt', 'w', encoding='utf-8') as f:
            f.write(item)
    with open(savepath + '/' + 'info.txt', 'w', encoding='utf-8') as f:
        # NOTE(review): the constant fields are presumably flags plus a
        # time limit (1000) and memory limit (256) for the target OJ -
        # confirm against its import format.
        f.write("\n".join(["1", str(cnt), title[0], "1", "1000", "256"]))
    with open(savepath + '/' + 'cate.txt', 'w', encoding='utf-8') as f:
        f.writelines(item + "\n" for item in cate)
    return
def get_data(session,url):
    """Fetch ``<url>/data`` through *session* and store the raw bytes.

    The response body is written unchanged to ``<savepath>/<cnt>.zip``,
    using the module-level ``savepath`` and ``cnt``.
    """
    archive_path = savepath + '/' + str(cnt) + '.zip'
    payload = session.get(url + '/data').content
    with open(archive_path, 'wb') as archive:
        archive.write(payload)
def main(offset):
    """Download every problem linked from listing page number *offset*.

    For each problem link on the page this calls ``login`` and, when it
    leaves the module-level ``flag`` set, ``get_data`` and ``get_prob``.
    The module-level ``cnt`` is printed and incremented once per link,
    whether or not the problem was downloaded.

    Args:
        offset: Page number used in the ``page=`` query parameter.
    """
    global cnt
    url = 'https://vijos.org/d/nnu_contest/p?page=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None when the download failed; there is
        # nothing to parse, so report it and leave cnt untouched.
        print("failed to fetch the problem list page")
        print(offset)
        return
    for item in parse_one_page(html):
        session = login(item)
        if flag:
            get_data(session, item)
            get_prob(session, item)
            print("successfully download the porblem")
        else:
            # login() cleared flag: the page had no title, so nothing was
            # saved. Do not claim success for it.
            print("skip the problem")
        print(cnt)
        print("---------------")
        cnt += 1
if __name__ == '__main__':
    # Make sure the output root exists before any problem is saved.
    mkdir('./prob')
    # Crawl listing pages 1, 2 and 3, pausing one second after each page.
    for page in range(1, 4):
        main(page)
        time.sleep(1)
网友评论