Skip to content
Permalink
392d3d00e4
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
318 lines (292 sloc) 10.4 KB
from eoa.models import *
from collections import OrderedDict
from bs4 import BeautifulSoup
import re
####################
# Importer class
####################
class TEIimporter():
    """
    Importer for TEI documents to python objects.

    The document is parsed once in ``__init__``; every other method reads
    from ``self.header`` or ``self.body`` and returns plain lists and dicts.
    """

    def __init__(self, data):
        """
        Parse the TEI file at path ``data``.

        Raises IndexError when the document has no ``teiHeader`` or no
        ``body`` element.
        """
        # Parse inside a ``with`` block so the file handle is closed as soon
        # as the parser has consumed it instead of being leaked.
        with open(data) as handle:
            self.soup = BeautifulSoup(handle, features="xml")
        self.header = self.soup.findAll('teiHeader')[0]
        self.body = self.soup.findAll('body')[0]

    @staticmethod
    def _firstContent(node, name, attrs=None):
        """
        Return the first child of the first ``name`` element below ``node``.

        Raises IndexError when no such element exists or it has no children.
        """
        return node.findAll(name, attrs or {})[0].contents[0]

    def publication(self):
        """
        Collect the publication metadata from the header.

        Returns a dict with the keys ``title``, ``series``,
        ``publication_id``, ``publisher``, ``created_date``, ``pages`` and
        ``price``, plus ``subtitle`` when the header has a sub title.

        Raises IndexError when a mandatory element is missing and KeyError
        when the series title has no ``n`` attribute.
        """
        res = {}
        series = self.header.findAll('title', {'level': 's'})[0]
        res['title'] = self._firstContent(self.header, 'title', {'type': 'main'})
        try:
            res['subtitle'] = self._firstContent(self.header, 'title', {'type': 'sub'})
        except IndexError:
            # The sub title is optional.
            pass
        res['series'] = series.contents[0].lower()
        res['publication_id'] = int(series.attrs['n'])
        res['publisher'] = self._firstContent(self.header, 'publisher')
        res['created_date'] = self._firstContent(self.header, 'date')
        # Hard-coded values; nothing is read from the document for these.
        res['pages'] = 1
        res['price'] = 1
        return res

    def authors(self):
        """
        Split every header ``author`` entry into name parts.

        Returns a list of dicts. A one-word name yields only ``lastname``;
        two words yield ``firstname`` and ``lastname``; longer names also
        get ``middlenames`` (the words in between, joined by one space).
        Names are split on any run of whitespace, and whitespace-only
        entries are skipped.
        """
        ret = []
        for node in self.header.findAll('author'):
            # split() without an argument also copes with line breaks and
            # repeated blanks inside the element text.
            parts = node.contents[0].split()
            if not parts:
                continue
            au = {}
            if len(parts) == 1:
                au['lastname'] = parts[0]
            else:
                au['firstname'] = parts[0]
                au['lastname'] = parts[-1]
                if len(parts) > 2:
                    au['middlenames'] = ' '.join(parts[1:-1])
            ret.append(au)
        return ret

    def getContent(self, obj):
        """
        Flatten the children of every element in ``obj`` into dicts.

        Each dict carries ``parentid`` (the ``xml:id`` of the element's
        parent, or None when it has none) and ``position`` (index of the
        child within its element). ``paragraphid`` (the ``xml:id`` of the
        element itself) is added when available. Children whose type is
        named ``Tag`` additionally get ``tagname``, ``attributes`` and a
        recursively built ``text`` list; children whose type is named
        ``NavigableString`` get their stripped string as ``text``.
        """
        conList = []
        for element in obj:
            # Resolve the parent id per element so that a missing id never
            # falls back to the value left over from a previous element.
            try:
                parent = element.parent.attrs['xml:id']
            except (AttributeError, KeyError):
                parent = None
            for i, con in enumerate(element.contents):
                temp = {'parentid': parent, 'position': i}
                try:
                    temp['paragraphid'] = con.parent.attrs['xml:id']
                except (AttributeError, KeyError):
                    pass
                kind = type(con).__name__
                if kind == 'Tag':
                    # Read the name from the node itself; this also works
                    # for tags that carry no attributes.
                    temp['tagname'] = con.name
                    temp['attributes'] = con.attrs
                    temp['text'] = self.getContent([con])
                elif kind == 'NavigableString':
                    temp['text'] = str(con).strip()
                conList.append(temp)
        return conList

    def findPart(self, part, keyname):
        """
        Describe every ``div`` of type ``part`` found in the body.

        Returns a list of dicts with ``idstring`` (the div's ``xml:id``) and
        ``keyname`` (the ``xml:id`` of the div's parent, or ``''``), plus
        ``title`` (contents of a ``head`` whose parent has the div's id),
        ``order`` (attribute ``n``) and ``number`` (attribute ``o``) when
        present.

        Raises KeyError when a matching div has no ``xml:id``.
        """
        ret = []
        for div in self.body.findAll('div', {'type': part}):
            divId = div.attrs['xml:id']
            pt = {}
            for head in div.findAll('head'):
                # Heads of nested elements are ignored; a head whose parent
                # has no id can never belong to this div.
                if head.parent.attrs.get('xml:id') == divId:
                    pt['title'] = head.contents
            pt['idstring'] = divId
            if 'n' in div.attrs:
                pt['order'] = div.attrs['n']
            if 'o' in div.attrs:
                pt['number'] = div.attrs['o']
            pt[keyname] = div.parent.attrs.get('xml:id', '')
            ret.append(pt)
        return ret

    def chapters(self):
        """Return the ``chapter`` divs; the parent id is stored under ``part``."""
        return self.findPart('chapter', 'part')

    def sections(self):
        """Return the ``section`` divs; the parent id is stored under ``chapter``."""
        return self.findPart('section', 'chapter')

    def subsections(self):
        """Return the ``subsection`` divs; the parent id is stored under ``section``."""
        return self.findPart('subsection', 'section')

    def subsubsections(self):
        """Return the ``subsubsection`` divs; the parent id is stored under ``subsection``."""
        return self.findPart('subsubsection', 'subsection')

    def paragraphs(self):
        """Return the flattened content of every ``p`` element in the body."""
        return self.getContent(self.body.findAll('p'))

    def _persons(self, bib, role):
        """
        Return ``lastname``/``firstname`` dicts for each ``role`` element.

        Raises IndexError when a person lacks ``surname`` or ``forename``.
        """
        return [
            {
                'lastname': self._firstContent(person, 'surname'),
                'firstname': self._firstContent(person, 'forename'),
            }
            for person in bib.findAll(role)
        ]

    def citations(self):
        """
        Extract every ``biblStruct`` entry of the body.

        Returns a list of 3-tuples
        ``(('authors', [...]), ('editors', [...]), fields)``. ``fields``
        holds ``pubtype``, ``idstring``, ``date`` and ``note``, optionally
        ``pubPlace``, ``publisher``, ``titlea`` (title level ``a``) and
        ``titlem`` (title level ``m`` or ``j``), and one entry per
        ``biblScope`` keyed by its ``unit`` attribute.

        Raises IndexError or KeyError when a mandatory element or attribute
        (``type``, ``xml:id``, ``date``, person names, scope ``unit``) is
        missing.
        """
        res = []
        for bib in self.body.findAll('biblStruct'):
            auth = self._persons(bib, 'author')
            edit = self._persons(bib, 'editor')
            tempB = {}
            for optional in ('pubPlace', 'publisher'):
                try:
                    tempB[optional] = self._firstContent(bib, optional)
                except IndexError:
                    pass
            tempB['pubtype'] = bib.attrs['type']
            tempB['idstring'] = bib.attrs['xml:id']
            tempB['date'] = self._firstContent(bib, 'date')
            tempB['note'] = ''.join(
                notepart.text.strip() for notepart in bib.findAll('note')
            )
            for title in bib.findAll('title'):
                # Titles without a level attribute are skipped.
                level = title.attrs.get('level')
                if level == 'a':
                    tempB['titlea'] = title.contents[0]
                elif level in ('m', 'j'):
                    tempB['titlem'] = title.contents[0]
            for scope in bib.findAll('biblScope'):
                tempB[scope.attrs['unit']] = scope.contents[0]
            res.append((('authors', auth), ('editors', edit), tempB))
        return res
########################
#
# Helper functions to write to db
#
########################
def createPublication(text):
    """
    Store the publication described by ``text`` and return it.

    ``text.publication()`` supplies the field values, which are passed as
    keyword arguments to ``Publication.objects.update_or_create``. Only the
    object of the returned ``(object, created)`` pair is handed back.
    """
    publication, _ = Publication.objects.update_or_create(**text.publication())
    return publication
def createAuthors(text, publication):
    """
    Store every author of ``text`` and attach ``publication`` to each.

    Each dict from ``text.authors()`` is passed as keyword arguments to
    ``Author.objects.update_or_create``; ``publication`` is then added to
    the resulting object's ``publications``. Returns the author objects in
    document order.
    """
    linked = []
    for fields in text.authors():
        author, _ = Author.objects.update_or_create(**fields)
        author.publications.add(publication)
        linked.append(author)
    return linked
def createChapters(text, publication):
    """
    Store every chapter of ``text`` under ``publication``.

    The ``part`` entry of each chapter dict is removed before the remaining
    fields, together with ``publication``, are passed to
    ``Chapter.objects.update_or_create``. Returns the chapter objects in
    document order.
    """
    stored = []
    for fields in text.chapters():
        # The parent part id is not passed on to the Chapter lookup.
        fields.pop('part')
        fields['publication'] = publication
        chapter, _ = Chapter.objects.update_or_create(**fields)
        stored.append(chapter)
    return stored
def createSections(text, publication):
    """
    Store every section of ``text`` under ``publication`` and its chapter.

    For each dict from ``text.sections()`` the ``chapter`` entry (an id
    string) is replaced by the Chapter object with that ``idstring`` and
    the fields are passed to ``Section.objects.update_or_create``.

    This is best effort: a section that cannot be stored is reported on
    stdout and skipped. Returns the section objects that were stored.
    """
    secList = []
    for section in text.sections():
        # Snapshot the parsed fields before they are mutated below, so the
        # diagnostic shows the section as it came out of the document.
        full = dict(section)
        try:
            chapID = section.pop('chapter')
            section['publication'] = publication
            section['chapter'] = Chapter.objects.get(idstring=chapID)
            secTemp, _ = Section.objects.update_or_create(**section)
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print('Section has no chapter as parent\n {0}'.format(full))
            continue
        secList.append(secTemp)
    return secList
def createParagraphs(text, publication):
    """
    Store the paragraph content of ``text`` and link it to its parent.

    For every fragment dict from ``text.paragraphs()``:

    * a root MixedContent (position 0) is looked up or created for the
      fragment's ``paragraphid``;
    * a fragment with ``attributes`` gets a child MixedContent plus a Text
      below it, a fragment with only ``text`` gets a Text directly below
      the root;
    * a Paragraph is looked up or created under the Chapter (``parentid``
      starting with ``chapter``) or Section (``parentid`` starting with
      ``sec``) with that ``idstring``. Any other ``parentid`` is printed.

    Returns the Paragraph objects that were stored; fragments for which no
    Paragraph could be stored contribute nothing to the list.

    Raises KeyError when a fragment has no ``paragraphid``.
    """
    parList = []
    for paragraph in text.paragraphs():
        mix, _ = MixedContent.objects.update_or_create(
            position=0,
            paragraphid=paragraph['paragraphid'],
        )
        if 'attributes' in paragraph:
            mix2, _ = MixedContent.objects.update_or_create(
                parent=mix,
                position=paragraph['position'],
                tagname=paragraph['tagname'],
                attributes=paragraph['attributes'],
            )
            Text.objects.update_or_create(
                parent=mix2,
                position=mix2.position,
                text=paragraph['text'],
            )
        elif 'text' in paragraph:
            # The result is not bound to a name, so the ``text`` parameter
            # is never shadowed.
            Text.objects.update_or_create(
                parent=mix,
                position=paragraph['position'],
                text=paragraph['text'],
            )

        # Reset per fragment so a failed lookup can neither raise NameError
        # nor re-append the Paragraph of an earlier fragment.
        paraTemp = None
        try:
            parent = paragraph['parentid']
            if parent.startswith('chapter'):
                paraTemp, _ = Paragraph.objects.update_or_create(
                    publication=publication,
                    idstring=paragraph['paragraphid'],
                    content=mix,
                    chapter=Chapter.objects.get(idstring=parent),
                )
            elif parent.startswith('sec'):
                paraTemp, _ = Paragraph.objects.update_or_create(
                    publication=publication,
                    idstring=paragraph['paragraphid'],
                    content=mix,
                    section=Section.objects.get(idstring=parent),
                )
            else:
                print(parent)
        except Exception:
            # Deliberate best effort: a fragment whose parent cannot be
            # resolved is skipped silently.
            pass
        if paraTemp is not None:
            parList.append(paraTemp)
    return parList
def createCitations(text, publication):
    """
    Store the bibliography of ``text`` with its authors and editors.

    Every entry from ``text.citations()`` is passed to
    ``Citation.objects.update_or_create``. Each author and editor is looked
    up or created through ``Author.objects.update_or_create``, and the
    citation is added to the person's ``citations`` (authors) or
    ``editions`` (editors). ``publication`` is accepted but not used.

    Returns ``(authors, editors, last)``: the two lists hold the stored
    person objects in document order. ``last`` is the field dict of the
    last citation processed, or None when there were no citations.
    """
    # NOTE(review): the third element is the raw dict of the last citation,
    # not the stored Citation objects. That looks unintended; confirm with
    # callers before changing the return shape.
    autList = []
    ediList = []
    cit = None  # stays None for an empty bibliography instead of being unbound
    for aut, edi, cit in text.citations():
        citTemp, _ = Citation.objects.update_or_create(**cit)
        # aut and edi are (label, persons) pairs; only the persons are used.
        for author in aut[1]:
            autTemp, _ = Author.objects.update_or_create(**author)
            autTemp.citations.add(citTemp)
            autList.append(autTemp)
        for editor in edi[1]:
            ediTemp, _ = Author.objects.update_or_create(**editor)
            ediTemp.editions.add(citTemp)
            ediList.append(ediTemp)
    return (autList, ediList, cit)
# print(cit['author'])
# try:
# cit['author'].split(' ')
# autTemp, created = Author.objects.update_or_create(
# firstname = cit['author'].split(' ')[1],
# lastname = cit['author'].split(' ')[-1],
# middlename = cit['author'].split(' ')[1:-1],
# )
# except:
# pass
# cit.pop('author')
# citTemp, created = Citation.objects.update_or_create(
# **cit
# )
# try:
# autTemp.citations.add(citTemp)
# autList.append(autTemp)
# except:
# pass
# citList.append(citTemp)
# return citList, autList