本教程参考视频:

从学术网站获取bib格式文件(例如Scopus、Web of Science、Google Scholar等)
查看sci-hub网站,调用格式,获取网站url(打开网页开发者模式,F12)
编写函数,获取导入bib文件,并格式化字段输出
编写获取文章下载链接函数(定义查找方法:以题目查找或者以文章DOI查找)
编写下载PDF函数,以及主函数
步骤详解:
一、获取bib文件,这里以Scopus网站为例:
https://www.scopus.com/
(1)以关键字搜索获取文献列表

(2)设置搜索范围,勾选全部,点击导出BibTex

(3)设置导出字段,点击bib格式,点击导出

二、查看sci-hub网站,调用格式,获取网站url(打开网页开发者模式,F12)
https://sci-hub.se/
(1)以题目搜索

(2)在"网络"面板中监听请求,点击响应,确认返回正确之后,再点击标题获取请求方法
三、编写函数,获取导入bib文件,并格式化字段输出
(1)bib文件格式预览
注意:不同网站导出的bib文件中,字段关键字的写法可能不一样。这里以Scopus导出的文件为例,其他网站导出的文件需要对"into_bib"中的正则表达式部分做相应的修改

(2)参考代码
def into_bib(file_tix_in):
    """
    Import the bib file and output the paper information
    ----------------------
    Input: path of the bib file (read as UTF-8)
    ----------------------
    Output: [authors, years, dois, titles] -- four lists of strings, each
            holding the regex hits for that field in file order.  A record
            whose field is absent, or whose value the pattern rejects,
            adds nothing to that list, so the four lists can differ in
            length and are only positionally aligned when every record
            matches every pattern.
    """
    # The context manager closes the handle even if reading fails.
    with open(file_tix_in, mode='r', encoding='utf-8') as file:
        # Newlines become '-' so that every field starts with the marker
        # "-<name>={" which the patterns below anchor on.
        lines = file.read().replace('\n', '-')
    lines = re.sub(r'(\s \s)', ' ', lines)

    # First comma/whitespace-delimited token of the author field.
    pattern_author = re.compile(r'-author={([A-Z -][^\s,]+)', re.I)
    pattern_year = re.compile(r'-year={([0-9]+)')
    # The two lookaheads skip values that start with '}' (empty) or '{'.
    # '\-' and '_' are listed explicitly: the previous class spelled the
    # hyphen as ' - ', which is a space-to-space range, so DOIs were cut
    # off at their first hyphen.
    pattern_doi = re.compile(r'-doi={(?!})(?!{)([a-zA-Z0-9 /.()_:;\-]+)')
    # Captures the title up to the first character outside the class.
    pattern_title = re.compile(r'-title={(?!})(?!{)([a-zA-Z0-9 \-:\s \']+)')

    match_author = pattern_author.findall(lines)
    match_year = pattern_year.findall(lines)
    match_doi = pattern_doi.findall(lines)
    match_title = pattern_title.findall(lines)
    return [match_author, match_year, match_doi, match_title]
# Section 4 (tutorial heading): write the function that obtains the article download link
参考代码
注意:获取下载链接的代码会随网站变化,网站更新后需要做相应的改动
如果 v_1 有问题,尝试用 v_2 运行
############################ v_1 ############################
def search_paper(artName, timeout=30):
    """
    Search papers
    ---------------
    Input: artName -- text sent as the "request" form field (the caller
                      passes a paper title or a DOI)
           timeout -- seconds to wait for the site before giving up
    ---------------
    Output: PDF link (str), or None when the request fails or the result
            page has no element with id "buttons" containing a link
    """
    url = 'https://www.sci-hub.ren/'
    # url = 'https://click.endnote.com/'
    # No Content-Length entry: requests derives it from the encoded form
    # body, so a hard-coded value is at best ignored.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
               'Accept-Encoding': 'gzip, deflate, br',
               'Content-Type': 'application/x-www-form-urlencoded',
               'Origin': 'https://www.sci-hub.ren',
               'Connection': 'keep-alive',
               'Upgrade-Insecure-Requests': '1'}
    data = {'sci-hub-plugin-check': '',
            'request': artName}
    try:
        # Without a timeout a stalled connection would block the whole batch.
        res = requests.post(url, headers=headers, data=data, timeout=timeout)
    except requests.RequestException:
        return None

    soup = BeautifulSoup(res.text, 'html.parser')
    buttons = soup.find(id='buttons')
    if buttons is None:
        # The page has no download-button container: nothing was found.
        return None
    links = re.findall(r'href=\'([^"]*)', str(buttons.contents))
    if not links:
        return None
    return url + links[0]
############################ v_2 ############################
from lxml import etree
def search_paper(artName, timeout=30):
    """
    Search papers
    ---------------
    Input: artName -- text sent as the "request" form field (the caller
                      passes a paper title or a DOI)
           timeout -- seconds to wait for the site before giving up
    ---------------
    Output: PDF link (str), or None when the request fails, the page
            cannot be parsed, or it has no button with an onclick target
            under the element with id "buttons"
    """
    from urllib.parse import urljoin

    url = 'https://www.sci-hub.ren/'
    # url = 'https://click.endnote.com/'
    # No Content-Length entry: requests derives it from the encoded form
    # body, so a hard-coded value is at best ignored.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
               'Accept-Encoding': 'gzip, deflate, br',
               'Content-Type': 'application/x-www-form-urlencoded',
               'Origin': 'https://www.sci-hub.ren',
               'Connection': 'keep-alive',
               'Upgrade-Insecure-Requests': '1'}
    data = {'sci-hub-plugin-check': '',
            'request': artName}
    try:
        # Without a timeout a stalled connection would block the whole batch.
        res = requests.post(url, headers=headers, data=data, timeout=timeout)
    except requests.RequestException:
        return None

    url_d = 'https://sci-hub.se/'
    try:
        tree = etree.HTML(res.text)
        onclick = tree.xpath("//*[@id='buttons']/button/@onclick")
        # The target is the text between the first pair of single quotes
        # of the onclick attribute.
        target = onclick[0].split("'")[1]
    except (AttributeError, IndexError, etree.LxmlError):
        # Unparseable page, no matching button, or no quoted target.
        return None
    # urljoin resolves a rooted path, a protocol-relative URL or an
    # absolute URL against url_d; plain concatenation produced "//" in
    # the first case and a broken address in the other two.
    return urljoin(url_d, target)
五、编写下载PDF函数,以及主函数
注意:如果用doi方式下载文献,bib文件中不允许出现没有doi的参考文献。file_tix 为bib文件的路径,save_tix 为paper的保存路径,保存的文件名为作者加年份
参考代码
(1)下载PDF
def download_paper(downUrl_in, timeout=60):
    """
    Download the paper according to the paper link
    ----------------------
    Input: downUrl_in -- paper link
           timeout    -- seconds to wait on the connection before
                         requests raises a timeout error
    ----------------------
    Output: body of the HTTP response as bytes (the HTTP status is not
            checked here)
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
               'Accept-Encoding': 'gzip, deflate, br',
               'Connection': 'keep-alive',
               'Upgrade-Insecure-Requests': '1'}
    # Without a timeout a stalled transfer would hang the batch forever.
    res = requests.get(downUrl_in, headers=headers, timeout=timeout)
    return res.content
# (2) Main function (tutorial heading)
if __name__ == '__main__':
    # Batch driver: parse the bib file, look every record up by DOI or by
    # title, save each PDF found as "<author><year>.pdf" and finally list
    # the records that could not be found.

    def _field(values, index, default=None):
        """Return values[index], or default when the list is too short."""
        return values[index] if index < len(values) else default

    # bib file address (os.path.join keeps the path portable)
    file_tix = os.path.join("bib", "scopus.bib")
    # File storage address
    save_tix = "paper"
    os.makedirs(save_tix, exist_ok=True)
    # find_way = 2,it's by DOI. find_way = 3,it's by Title
    find_way = 3
    paper_find = into_bib(file_tix)
    authors, years, dois, titles = paper_find
    search_keys = paper_find[find_way]
    print("Bib contains {num} papers".format(num=len(years)))
    # '==' rather than 'is': identity comparison against an int literal
    # is not a reliable equality test.
    if find_way == 2 and len(years) != len(dois):
        print("Records contain missing DOI records. Please select the search method as paper title search, "
              "or complete (delete) the missing DOI records")
        sys.exit()
    download_code = []
    for tix_num, year in enumerate(years):
        paper_no = tix_num + 1
        print('NO.{num} Searching...'.format(num=paper_no))
        # The field lists are only positionally aligned and may be shorter
        # than the year list; a missing search key counts as "not found"
        # instead of raising IndexError.
        key = _field(search_keys, tix_num)
        downUrl = search_paper(key) if key is not None else None
        if downUrl is None:
            print('NO.{num} Not found!'.format(num=paper_no))
            download_code.append(paper_no)
        else:
            print('NO.{num} Paper link:{paper_link}'.format(
                num=paper_no, paper_link=downUrl))
            print('Downloading...')
            pdf = download_paper(downUrl)
            paper_name = _field(authors, tix_num, 'Unknown') + year
            with open(os.path.join(save_tix, paper_name + '.pdf'), 'wb') as f:
                f.write(pdf)
            print('---Download complete---')
        # Short pause between records so requests are not sent back to back.
        time.sleep(0.8)
    print("The papers records not found are NO.{num}".format(
        num=download_code))
    print("The title of papers that was not found are:")
    for paper_no in download_code:
        print("NO.{num}: {title}".format(
            num=paper_no, title=_field(titles, paper_no - 1, '(title not parsed)')))
# Section 6 (tutorial heading): complete reference code
参考代码
# -*- coding: utf-8 -*-
"""
The program is used to download papers in batch.
The input data is BibTex file.
The program has two search methods:
1. search according to the title of the paper
2. search according to the DOI number of the paper
September 2022/02/15 python 3.6
"""
import time
import re
import requests
from lxml import etree
import sys
import os
def search_paper(artName, timeout=30):
    """
    Search papers
    ---------------
    Input: artName -- text sent as the "request" form field (the caller
                      passes a paper title or a DOI)
           timeout -- seconds to wait for the site before giving up
    ---------------
    Output: PDF link (str), or None when the request fails, the page
            cannot be parsed, or it has no button with an onclick target
            under the element with id "buttons"
    """
    from urllib.parse import urljoin

    url = 'https://www.sci-hub.ren/'
    # url = 'https://click.endnote.com/'
    # No Content-Length entry: requests derives it from the encoded form
    # body, so a hard-coded value is at best ignored.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
               'Accept-Encoding': 'gzip, deflate, br',
               'Content-Type': 'application/x-www-form-urlencoded',
               'Origin': 'https://www.sci-hub.ren',
               'Connection': 'keep-alive',
               'Upgrade-Insecure-Requests': '1'}
    data = {'sci-hub-plugin-check': '',
            'request': artName}
    try:
        # Without a timeout a stalled connection would block the whole batch.
        res = requests.post(url, headers=headers, data=data, timeout=timeout)
    except requests.RequestException:
        return None

    url_d = 'https://sci-hub.se/'
    try:
        tree = etree.HTML(res.text)
        onclick = tree.xpath("//*[@id='buttons']/button/@onclick")
        # The target is the text between the first pair of single quotes
        # of the onclick attribute.
        target = onclick[0].split("'")[1]
    except (AttributeError, IndexError, etree.LxmlError):
        # Unparseable page, no matching button, or no quoted target.
        return None
    # urljoin resolves a rooted path, a protocol-relative URL or an
    # absolute URL against url_d; plain concatenation produced "//" in
    # the first case and a broken address in the other two.
    return urljoin(url_d, target)
def download_paper(downUrl_in, timeout=60):
    """
    Download the paper according to the paper link
    ----------------------
    Input: downUrl_in -- paper link
           timeout    -- seconds to wait on the connection before
                         requests raises a timeout error
    ----------------------
    Output: body of the HTTP response as bytes (the HTTP status is not
            checked here)
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
               'Accept-Encoding': 'gzip, deflate, br',
               'Connection': 'keep-alive',
               'Upgrade-Insecure-Requests': '1'}
    # Without a timeout a stalled transfer would hang the batch forever.
    res = requests.get(downUrl_in, headers=headers, timeout=timeout)
    return res.content
def into_bib(file_tix_in):
    """
    Import the bib file and output the paper information
    ----------------------
    Input: path of the bib file (read as UTF-8)
    ----------------------
    Output: [authors, years, dois, titles] -- four lists of strings, each
            holding the regex hits for that field in file order.  A record
            whose field is absent, or whose value the pattern rejects,
            adds nothing to that list, so the four lists can differ in
            length and are only positionally aligned when every record
            matches every pattern.
    """
    # The context manager closes the handle even if reading fails.
    with open(file_tix_in, mode='r', encoding='utf-8') as file:
        # Newlines become '-' so that every field starts with the marker
        # "-<name>={" which the patterns below anchor on.
        lines = file.read().replace('\n', '-')
    lines = re.sub(r'(\s \s)', ' ', lines)

    # First comma/whitespace-delimited token of the author field.
    pattern_author = re.compile(r'-author={([A-Z -][^\s,]+)', re.I)
    pattern_year = re.compile(r'-year={([0-9]+)')
    # The two lookaheads skip values that start with '}' (empty) or '{'.
    # '\-' and '_' are listed explicitly: the previous class spelled the
    # hyphen as ' - ', which is a space-to-space range, so DOIs were cut
    # off at their first hyphen.
    pattern_doi = re.compile(r'-doi={(?!})(?!{)([a-zA-Z0-9 /.()_:;\-]+)')
    # Captures the title up to the first character outside the class.
    pattern_title = re.compile(r'-title={(?!})(?!{)([a-zA-Z0-9 \-:\s \']+)')
    # Alternative patterns for exports that write capitalised keys as
    # "Key = {" / "Key = {{" (swap them in for such files):
    # pattern_author = re.compile(r'-Author = {([A-Z -][^\s,]+)', re.I)
    # pattern_year = re.compile(r'-Year = {{([0-9]+)')
    # pattern_doi = re.compile(r'-DOI = {{(?!})(?!{)([a-zA-Z0-9 /.()_:;\-]+)')
    # pattern_title = re.compile(r'-Title = {{(?!})(?!{)([a-zA-Z0-9 \-:\s \']+)')

    match_author = pattern_author.findall(lines)
    match_year = pattern_year.findall(lines)
    match_doi = pattern_doi.findall(lines)
    match_title = pattern_title.findall(lines)
    return [match_author, match_year, match_doi, match_title]
if __name__ == '__main__':
    # Batch driver: parse the bib file, look every record up by DOI or by
    # title, save each PDF found as "<author><year>.pdf" and finally list
    # the records that could not be found.

    def _field(values, index, default=None):
        """Return values[index], or default when the list is too short."""
        return values[index] if index < len(values) else default

    # bib file address (os.path.join keeps the path portable)
    file_tix = os.path.join("bib", "scopus.bib")
    # File storage address
    save_tix = "paper"
    os.makedirs(save_tix, exist_ok=True)
    # find_way = 2,it's by DOI. find_way = 3,it's by Title
    find_way = 2
    paper_find = into_bib(file_tix)
    authors, years, dois, titles = paper_find
    search_keys = paper_find[find_way]
    print("Bib contains {num} papers".format(num=len(years)))
    # '==' rather than 'is': identity comparison against an int literal
    # is not a reliable equality test.
    if find_way == 2 and len(years) != len(dois):
        # Warn but keep going: records past the end of the parsed DOI
        # list are reported as not found by the loop below.
        print("Records contain missing DOI records. Please select the search method as paper title search, "
              "or complete (delete) the missing DOI records")
    download_code = []
    for tix_num, year in enumerate(years):
        paper_no = tix_num + 1
        print('NO.{num} Searching...'.format(num=paper_no))
        # The field lists are only positionally aligned and may be shorter
        # than the year list; a missing search key counts as "not found"
        # instead of raising IndexError.
        key = _field(search_keys, tix_num)
        downUrl = search_paper(key) if key is not None else None
        if downUrl is None:
            print('NO.{num} Not found!'.format(num=paper_no))
            download_code.append(paper_no)
        else:
            print('NO.{num} Paper link:{paper_link}'.format(
                num=paper_no, paper_link=downUrl))
            print('Downloading...')
            pdf = download_paper(downUrl)
            paper_name = _field(authors, tix_num, 'Unknown') + year
            with open(os.path.join(save_tix, paper_name + '.pdf'), 'wb') as f:
                f.write(pdf)
            print('---Download complete---')
        # Short pause between records so requests are not sent back to back.
        time.sleep(0.8)
    print("The papers records not found are NO.{num}".format(
        num=download_code))
    print("The title of papers that was not found are:")
    for paper_no in download_code:
        print("NO.{num}: {title}".format(
            num=paper_no, title=_field(titles, paper_no - 1, '(title not parsed)')))