Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 179 lines (145 sloc) 5.61 KB
#!/usr/bin/python3
import os
import time
import traceback
import re
# Character budgets: the archived file name (month prefix included, extension
# excluded) and the BibTeX citation key derived from the paper title.
MAX_FILENAME_LEN = 60
MAX_KEYWORD_LEN = 20

# All archive paths below are relative ("./<type>/..."), so run from the
# directory that contains this script regardless of how it was launched.
_scriptPath = os.path.realpath(__file__)
os.chdir(os.path.dirname(_scriptPath))

# Directory that is polled for freshly downloaded .pdf/.ris/.bib files.
srcDir = os.path.expanduser("~/Desktop")
def printErr(err):
    """Write an import failure to disk and abort the current attempt.

    The message is stored in ``paper-collector-error.txt`` inside ``srcDir``,
    replacing any previous report. This function never returns normally.

    Args:
        err: Human-readable description of what went wrong.

    Raises:
        RuntimeError: Always, after the report file has been written.
    """
    reportPath = srcDir + "/paper-collector-error.txt"
    with open(reportPath, "w") as report:
        report.write("Could not import paper:\n" + err)
    raise RuntimeError("Unexpected error.")
def joinSegmensTillLength(length, maxSegmentLen = 100, words = None):
    """Join words into a hyphen-separated slug of at most ``length`` characters.

    Every word is reduced to its ``[a-z0-9]`` characters and cut to
    ``maxSegmentLen`` characters. Words are appended in order, separated by
    "-", and joining stops at the first word that no longer fits.

    Args:
        length: Maximum length of the returned string.
        maxSegmentLen: Maximum number of characters kept from each word.
        words: Iterable of words to join. When omitted, the module-level
            ``segments`` list is used, which is how existing callers pass
            their input.

    Returns:
        The joined slug; empty when no word fits or no word has usable
        characters.
    """
    if words is None:
        # Backward compatibility: callers set the global ``segments`` before
        # calling this function with only the length arguments.
        words = segments

    pieces = []
    used = 0
    for word in words:
        piece = re.sub("[^a-z0-9]+", "", word)[:maxSegmentLen]
        if not piece:
            # A word made only of punctuation (e.g. "-" or ":") would
            # otherwise yield "--" or a trailing "-" in the result.
            continue
        # The "-" separator only costs a character once something precedes it.
        needed = len(piece) + (1 if pieces else 0)
        if used + needed > length:
            break
        pieces.append(piece)
        used += needed
    return "-".join(pieces)
# RIS tags whose value is copied as-is into the entry dict, keyed by their
# two-letter tag. "AU" (repeatable) and "PY" (year digits only) are handled
# separately in the loop below.
RIS_TAG_TO_FIELD = {
    "VL": "volume",
    "TI": "title",
    "T1": "title",
    "JA": "journal",
    "IS": "number",
    "SP": "startpage",
    "EP": "endpage",
    "SN": "isbn",
    "AB": "abstract",
    "UR": "url",
}

# Poll the source directory forever. Each pass first converts a downloaded
# .ris citation into a .bib file, then files a lone .pdf/.bib pair into
# "./<entry type>/<year>_<month>/" under a name derived from the title.
while True:
    # Archive sub-directory for the current month, e.g. "2024_05/".
    PREFIX = time.strftime("%Y_%m/")

    #--------------------------------------------------------------------------------------
    # Collect nature papers
    try:
        pdfList = []
        risList = []
        for elem in os.listdir(srcDir):
            if elem.endswith(".pdf"):
                pdfList.append(elem)
            if elem.endswith(".ris"):
                risList.append(elem)

        # A lone .ris next to a lone .pdf is treated as belonging to it:
        # rename the .ris so both share the same base name.
        if len(risList) == 1 and len(pdfList) == 1:
            os.rename(srcDir+"/"+risList[0], srcDir+"/"+pdfList[0][:-4]+".ris")
            risList[0] = pdfList[0][:-4]+".ris"

        for pdf in pdfList:
            key = ".".join(pdf.split(".")[:-1])
            if key+".ris" not in risList:
                continue

            # Parse the RIS file. Each line is "<2-letter tag>  - <value>",
            # so the value starts at column 6.
            entries = {'authors': []}
            with open(srcDir+"/"+key+".ris") as ris:
                for line in ris:
                    tag = line[:2]
                    # Strip only the line terminator; slicing off the last
                    # character would eat real data on an unterminated line.
                    value = line[6:].rstrip("\r\n")
                    if tag == "PY":
                        entries['year'] = value[:4]
                    elif tag == "AU":
                        entries['authors'].append(value)
                    elif tag in RIS_TAG_TO_FIELD:
                        entries[RIS_TAG_TO_FIELD[tag]] = value

            # The citation key must not contain whitespace or commas, or the
            # key regex in the second stage cannot find it. It is replaced by
            # a title-derived keyword there anyway.
            citeKey = re.sub(r"[\s,{}]+", "_", key)

            # Build the whole record before touching the disk so that a
            # failure cannot leave a truncated .bib behind. Every field other
            # than the author list is emitted only when the RIS file has it.
            bibLines = ['@article{' + citeKey + ',\n',
                        ' author = "' + " and ".join(entries['authors']) + '",\n']
            for field in ('year', 'title', 'abstract', 'journal', 'volume'):
                if field in entries:
                    bibLines.append(' ' + field + ' = "' + entries[field] + '",\n')
            if 'startpage' in entries and 'endpage' in entries:
                bibLines.append(' pages = "' + entries['startpage']
                                + "--" + entries['endpage'] + '",\n')
            if 'url' in entries:
                bibLines.append(' url = "' + entries['url'] + '",\n')
            bibLines.append('}\n')

            with open(srcDir+"/"+key+".bib", 'w') as bib:
                bib.write("".join(bibLines))
            os.remove(srcDir+"/"+key+".ris")
            # Only one paper is converted per pass.
            break
    except Exception:
        # Failures in this stage are fatal: log the traceback and stop.
        traceback.print_exc()
        raise

    #--------------------------------------------------------------------------------------
    # Collect other papers
    try:
        pdfList = []
        bibList = []
        for elem in os.listdir(srcDir):
            if elem.endswith(".pdf"):
                pdfList.append(elem)
            if elem.endswith((".bib", ".bibtex")):
                bibList.append(elem)

        # Only act when the pairing is unambiguous: one .pdf and one .bib.
        if len(pdfList) == 1 and len(bibList) == 1:
            pdfFile = srcDir+"/"+pdfList[0]
            bibFile = srcDir+"/"+bibList[0]
            # NOTE(review): the 1e99 factor makes this bound so large that the
            # timestamp check never rejects a pair; presumably disabled on
            # purpose -- confirm before tightening.
            if abs(os.stat(pdfFile).st_mtime -
                   os.stat(bibFile).st_mtime) < 1e99* 60*60:
                with open(bibFile, "r") as f:
                    text = f.read()

                # Capture everything from the title's opening delimiter up to
                # the final "}" of the file.
                sections = re.findall(r"@\S+\s*{.*\stitle\s*=\s*(.*)}", text, re.S)
                if len(sections) == 1:
                    t = sections[0]
                    # t[:1] instead of t[0]: an empty capture is reported as a
                    # bad delimiter rather than raising IndexError.
                    if t[:1] == "{":
                        endChar = "}"
                    elif t[:1] == '"':
                        endChar = '"'
                    else:
                        printErr("unexpected delimiter in title.")

                    if endChar == "}":
                        # Walk to the brace that closes the opening one,
                        # allowing nested {...} groups inside the title.
                        opens = 0
                        for pos, char in enumerate(t):
                            if char == "{":
                                opens += 1
                            elif char == "}":
                                opens -= 1
                            if opens == 0:
                                break
                        else:
                            printErr("unbalanced braces in title.")
                        title = t[1:pos].lower()
                    else:
                        # Title runs from the first quote to the second one.
                        title = t[1:t.find(endChar,t.find(endChar)+1)].lower()

                    # joinSegmensTillLength reads the module-level `segments`.
                    segments = re.split(r"\s+", title)
                    keyword = joinSegmensTillLength(MAX_KEYWORD_LEN, 5)
                    filename = joinSegmensTillLength(MAX_FILENAME_LEN-len(PREFIX))
                    if not filename:
                        # Without this the files would be stored as ".pdf" /
                        # ".bib" and overwrite each other.
                        printErr("could not derive a file name from the title.")

                    # Group 1 is the entry type (e.g. "article"), group 2 the
                    # citation key, which is replaced by the title keyword.
                    m = re.match(r".*@(\S+)\s*{\s*(\S+)\s*,.*", text, re.S)
                    paperType = m.group(1).lower()
                    text = text[:m.start(2)] + keyword + text[m.end(2):]

                    targetPath = "./"+paperType+"/"+PREFIX+filename+".pdf"
                    os.makedirs(os.path.dirname(targetPath), exist_ok=True)
                    os.rename(pdfFile, targetPath)
                    with open("./"+paperType+"/"+PREFIX+filename+".bib", "w") as f:
                        f.write(text.strip()+"\n")
                    os.remove(bibFile)
                else:
                    printErr("expected only one section in .bib file.")
            else:
                printErr("found .bib and .pdf but ignored them because timestamps differ to much.")
    except Exception:
        # Best effort: log and retry on the next pass. Catching Exception
        # (not everything) keeps Ctrl-C and SystemExit able to stop the loop.
        traceback.print_exc()

    time.sleep(10)