Introduction

This project automatically fetches the latest NLP papers from arXiv and generates a markdown table listing each paper's title, authors, links, and open-source code (when available).

Project repository: https://github.com/JackHCC/Arxiv-NLP-Reporter

Web version: blog.creativecc.cn/arxiv-nlp-reporter

With a GitHub Actions workflow configured, the paper list is updated daily and republished to GitHub Pages.
![](https://img.haomeiwen.com/i14093662/f613d69e2cce6533.png)
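For reference, each row of the generated table looks like the sketch below; the date, title, arXiv id, and URLs are made-up placeholders:

```
|Publish Date|Title|Authors|PDF|Code|
|---|---|---|---|---|
|**2022-01-01**|**An Example NLP Paper**|First Author et.al.|[2201.00001v1](http://arxiv.org/abs/2201.00001v1)|**[link](https://github.com/example/repo)**|
```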
Code
```python
import datetime
import json
import os
import re

import arxiv
import requests

base_url = "https://arxiv.paperswithcode.com/api/v0/papers/"
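
# As used below: the endpoint is queried once per paper id, and the response
# JSON is expected to carry an "official" object whose "url" field points to
# the paper's official code repository (absent when no implementation is known).
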
def del_unicode(string):
    string = re.sub(r'\\u.{4}', '', string.__repr__())
    return string


def del_not_english(string):
    string = re.sub('[^A-Za-z]', '', string.__str__())
    return string


def get_authors(authors, first_author=False):
    output = str()
    if first_author == False:
        output = ", ".join(str(author) for author in authors)
    else:
        output = authors[0]
    return output
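
# e.g. get_authors(result.authors) returns "Alice A, Bob B, Carol C" (all names
# joined), while get_authors(result.authors, first_author=True) returns only
# the first author (names here are hypothetical, for illustration).
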
def sort_papers(papers):
    output = dict()
    keys = list(papers.keys())
    keys.sort(reverse=True)
    for key in keys:
        output[key] = papers[key]
    return output
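
# Keys here are arXiv ids with the version suffix stripped, so a reverse sort
# (e.g. "2201.00002" before "2201.00001") lists the newest papers first.
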
def get_daily_papers(topic, query="nlp", max_results=2):
    """
    @param topic: str
    @param query: str
    @return: two dicts keyed by topic (README table rows, web page rows)
    """
    # output containers: one for the README table, one for the web page
    content = dict()
    content_to_web = dict()

    search_engine = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )

    cnt = 0
    for result in search_engine.results():
        paper_id = result.get_short_id()
        paper_title = result.title
        paper_url = result.entry_id
        code_url = base_url + paper_id
        paper_abstract = result.summary.replace("\n", " ")
        paper_authors = get_authors(result.authors)
        paper_first_author = get_authors(result.authors, first_author=True)
        primary_category = result.primary_category
        publish_time = result.published.date()
        update_time = result.updated.date()

        print("Time = ", update_time,
              " title = ", paper_title,
              " author = ", paper_first_author)

        # e.g. 2108.09112v1 -> 2108.09112
        ver_pos = paper_id.find('v')
        if ver_pos == -1:
            paper_key = paper_id
        else:
            paper_key = paper_id[0:ver_pos]

        try:
            r = requests.get(code_url).json()
            # source code link
            if "official" in r and r["official"]:
                cnt += 1
                repo_url = r["official"]["url"]
                content[paper_key] = f"|**{update_time}**|**{paper_title}**|{paper_first_author} et.al.|[{paper_id}]({paper_url})|**[link]({repo_url})**|\n"
                content_to_web[paper_key] = f"- **{update_time}**, **{paper_title}**, {paper_first_author} et.al., [PDF:{paper_id}]({paper_url}), **[code]({repo_url})**\n"
            else:
                content[paper_key] = f"|**{update_time}**|**{paper_title}**|{paper_first_author} et.al.|[{paper_id}]({paper_url})|null|\n"
                content_to_web[paper_key] = f"- **{update_time}**, **{paper_title}**, {paper_first_author} et.al., [PDF:{paper_id}]({paper_url})\n"
        except Exception as e:
            print(f"exception: {e} with id: {paper_key}")

    data = {topic: content}
    data_web = {topic: content_to_web}
    return data, data_web

def update_json_file(filename, data_all):
    # start from the existing JSON if the file is already there
    m = {}
    if os.path.exists(filename):
        with open(filename, "r") as f:
            content = f.read()
            if content:
                m = json.loads(content)

    json_data = m.copy()

    # update papers under each keyword
    for data in data_all:
        for keyword in data.keys():
            papers = data[keyword]
            if keyword in json_data.keys():
                json_data[keyword].update(papers)
            else:
                json_data[keyword] = papers

    with open(filename, "w") as f:
        json.dump(json_data, f)

def json_to_md(filename, to_web=False):
    """
    @param filename: str
    @return None
    """
    DateNow = datetime.date.today()
    DateNow = str(DateNow)
    DateNow = DateNow.replace('-', '.')

    with open(filename, "r") as f:
        content = f.read()
        if not content:
            data = {}
        else:
            data = json.loads(content)

    if to_web == False:
        md_filename = "README.md"
    else:
        md_filename = "./docs/index.md"

    # truncate the markdown file if it already exists, else create it
    with open(md_filename, "w+") as f:
        pass

    # write data into the markdown file
    with open(md_filename, "a+", encoding='utf-8') as f:
        if to_web == True:
            f.write("---\n" + "layout: default\n" + "---\n\n")

        f.write("## Updated on " + DateNow + "\n\n")

        for keyword in data.keys():
            day_content = data[keyword]
            if not day_content:
                continue
            # the head of each part
            f.write(f"## {keyword}\n\n")

            if to_web == False:
                f.write("|Publish Date|Title|Authors|PDF|Code|\n" + "|---|---|---|---|---|\n")
            else:
                f.write("| Publish Date | Title | Authors | PDF | Code |\n")
                f.write("|:---------|:-----------------------|:---------|:------|:------|\n")

            # sort papers by date
            day_content = sort_papers(day_content)

            for _, v in day_content.items():
                if v is not None:
                    f.write(v)

            f.write(f"\n")

    print("finished")

if __name__ == "__main__":
    data_collector = []
    data_collector_web = []

    # arXiv search queries per topic; OR operators need surrounding spaces
    keywords = dict()
    keywords["NLP"] = 'NLP OR "Natural Language Processing"'
    keywords["Sequence Annotation"] = '"Sequence Annotation" OR "Sequence Marking" OR "Named Entity Recognition"'
    keywords["Text Classification"] = '"Text Classification" OR "Sentiment Analysis" OR "Topic Labeling" OR "News Classification" OR "Question Answering" OR "Dialog Act Classification" OR "Natural Language Inference" OR "Relation Classification" OR "Event Prediction"'
    keywords["Information Extraction"] = '"Information Extraction" OR "Automatic Summary" OR "Title Generation" OR "Event Extraction"'
    keywords["Recommendation System"] = '"Recommendation System" OR "Semantic Matching" OR "Chatbots" OR "Knowledge Graph" OR "Knowledge Graphs"'
    keywords["GNN"] = 'GNN OR "Recommendation System" OR "Graph Neural Network"'

    for topic, keyword in keywords.items():
        print("Keyword: " + topic)
        data, data_web = get_daily_papers(topic, query=keyword, max_results=10)
        data_collector.append(data)
        data_collector_web.append(data_web)
        print("\n")

    # update the README.md table
    json_file = "nlp-arxiv-daily.json"
    update_json_file(json_file, data_collector)
    json_to_md(json_file)

    # update the docs/index.md page
    json_file = "./docs/nlp-arxiv-daily-web.json"
    update_json_file(json_file, data_collector)
    json_to_md(json_file, to_web=True)
```
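Before wiring everything together, it can help to check what the Papers with Code endpoint returns for a single paper. A minimal standalone sketch, mirroring the lookup in the script (the id below is just the example from the comment in the code):

```python
import requests

paper_id = "2108.09112"  # example id, borrowed from the comment in the script above
r = requests.get(f"https://arxiv.paperswithcode.com/api/v0/papers/{paper_id}").json()

# "official", when present, holds the official repository metadata.
official = r.get("official")
print(official["url"] if official else "no official code found")
```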
Workflow configuration
```yaml
# This is a basic workflow to help you get started with Actions
name: Run Arxiv Papers Daily

# Controls when the workflow will run
on:
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
  schedule:
    - cron: "0 0/12 * * *"  # twice a day, at 00:00 and 12:00 UTC
  # Triggers the workflow on push or pull request events but only for the main branch
  # push:
  #   branches:
  #     - main

env:
  GITHUB_USER_NAME: JackHCC
  GITHUB_USER_EMAIL: jackcc0701@163.com

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # This workflow contains a single job called "build"
  build:
    name: update
    # The type of runner that the job will run on
    runs-on: ubuntu-latest
    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      - name: Checkout
        uses: actions/checkout@v2

      - name: Set up Python Env
        uses: actions/setup-python@v1
        with:
          python-version: 3.6

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install arxiv
          pip install requests

      - name: Run arxiv report
        run: |
          python arxiv-report.py

      - name: Push new nlp-arxiv-daily.md
        uses: github-actions-x/commit@v2.8
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: "Github Action Automatic Update NLP Arxiv Papers"
          files: README.md nlp-arxiv-daily.json docs/nlp-arxiv-daily-web.json docs/index.md
          rebase: 'true'
          name: ${{ env.GITHUB_USER_NAME }}
          email: ${{ env.GITHUB_USER_EMAIL }}
```
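Note that GitHub Actions evaluates cron expressions in UTC, so `0 0/12 * * *` fires at minute 0 of hours 0 and 12 each day; the `workflow_dispatch` trigger additionally lets you start the job manually from the Actions tab. After each run, the commit step pushes the regenerated README.md, the two JSON data files, and docs/index.md back to the repository.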