Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
django-tei-importer/publications/management/commands/_importer_class.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
318 lines (292 sloc)
10.4 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from eoa.models import * | |
from collections import OrderedDict | |
from bs4 import BeautifulSoup | |
import re | |
#################### | |
# Importer class | |
#################### | |
class TEIimporter():
    """
    Importer for TEI documents to python objects.

    The TEI file is parsed once in the constructor; every public method
    returns plain dicts/lists built from the parsed ``teiHeader`` or ``body``.
    """

    def __init__(self, data):
        """Parse the TEI/XML file at path *data*.

        Raises:
            IndexError: if the document has no ``teiHeader`` or no ``body``.
        """
        # BeautifulSoup reads the whole handle while constructing the tree,
        # so the file can be closed as soon as the constructor returns.
        with open(data) as handle:
            self.soup = BeautifulSoup(handle, features="xml")
        self.header = self.soup.findAll('teiHeader')[0]
        self.body = self.soup.findAll('body')[0]

    @staticmethod
    def _xml_id(node):
        """Return the ``xml:id`` attribute of *node*, or None when unavailable."""
        attrs = getattr(node, 'attrs', None)
        if not attrs:
            return None
        return attrs.get('xml:id')

    @staticmethod
    def _first_text(node, tagname):
        """Return the first content item of the first *tagname* under *node*, or None."""
        matches = node.findAll(tagname)
        if matches and matches[0].contents:
            return matches[0].contents[0]
        return None

    @staticmethod
    def _persons(bib, tagname):
        """Return one ``{'lastname', 'firstname'}`` dict per *tagname* element in *bib*.

        Raises IndexError when a person lacks a ``surname`` or ``forename``.
        """
        return [
            {
                'lastname': person.findAll('surname')[0].contents[0],
                'firstname': person.findAll('forename')[0].contents[0],
            }
            for person in bib.findAll(tagname)
        ]

    def publication(self):
        """Return the publication metadata found in the TEI header.

        Keys: ``title``, optional ``subtitle``, ``series`` (lower-cased),
        ``publication_id`` (int, from the series title's ``n`` attribute),
        ``publisher``, ``created_date``, and the placeholders ``pages`` and
        ``price`` (both fixed to 1).

        Raises:
            IndexError: if a mandatory element (main title, series title,
                publisher, date) is missing or empty.
            KeyError: if the series title has no ``n`` attribute.
        """
        header = self.header
        series = header.findAll('title', {'level': 's'})[0]
        res = {}
        res['title'] = header.findAll('title', {'type': 'main'})[0].contents[0]
        # The subtitle is optional: leave the key out when it is absent.
        subtitles = header.findAll('title', {'type': 'sub'})
        if subtitles and subtitles[0].contents:
            res['subtitle'] = subtitles[0].contents[0]
        res['series'] = series.contents[0].lower()
        res['publication_id'] = int(series.attrs['n'])
        res['publisher'] = header.findAll('publisher')[0].contents[0]
        res['created_date'] = header.findAll('date')[0].contents[0]
        res['pages'] = 1
        res['price'] = 1
        return res

    def authors(self):
        """Return one name dict per ``author`` element of the header.

        The first content item of each element is split on whitespace:
        a single token becomes ``lastname``; two tokens become ``firstname``
        and ``lastname``; with more tokens the ones in between are joined
        into ``middlenames``. Empty or whitespace-only authors are skipped.
        """
        ret = []
        for element in self.header.findAll('author'):
            if not element.contents:
                continue
            # split() without argument collapses runs of whitespace (line
            # breaks and indentation inside the XML) instead of yielding
            # empty name parts.
            parts = element.contents[0].split()
            if not parts:
                continue
            au = {}
            if len(parts) >= 2:
                au['firstname'] = parts[0]
            au['lastname'] = parts[-1]
            if len(parts) > 2:
                au['middlenames'] = ' '.join(parts[1:-1])
            ret.append(au)
        return ret

    def getContent(self, obj):
        """Flatten the direct children of every element in *obj* into dicts.

        Each child yields a dict with ``parentid`` (``xml:id`` of the
        element's parent, or None when there is none), ``position`` (index
        among its siblings) and, when the element itself has an ``xml:id``,
        ``paragraphid``. Tag children additionally get ``tagname``,
        ``attributes`` and ``text`` (their own children, converted
        recursively); string children get ``text`` as a stripped ``str``.
        Children of any other node type get no ``text`` entry.
        """
        conList = []
        for element in obj:
            # Resolved per element so that a missing id never leaks the id
            # of a previously processed element.
            parent_id = self._xml_id(element.parent)
            for position, con in enumerate(element.contents):
                temp = {'parentid': parent_id, 'position': position}
                paragraph_id = self._xml_id(con.parent)
                if paragraph_id is not None:
                    temp['paragraphid'] = paragraph_id
                # Compared by class name on purpose: subclasses such as
                # comments must not be treated as plain strings.
                kind = type(con).__name__
                if kind == 'Tag':
                    # Tag.name also works for tags without attributes
                    # (e.g. <lb/>), where parsing the serialised markup
                    # for "<name " finds nothing.
                    temp['tagname'] = con.name
                    temp['attributes'] = con.attrs
                    temp['text'] = self.getContent([con])
                elif kind == 'NavigableString':
                    temp['text'] = str(con).strip()
                conList.append(temp)
        return conList

    def findPart(self, part, keyname):
        """Describe every ``<div type=part>`` of the body.

        Args:
            part: value of the ``type`` attribute to look for.
            keyname: key under which the parent element's ``xml:id`` is
                stored (empty string when the parent has none).

        Returns:
            A list of dicts with ``idstring``, *keyname*, and, when
            available, ``title`` (contents of the div's own ``head``),
            ``order`` (attribute ``n``) and ``number`` (attribute ``o``).

        Raises:
            KeyError: if a matching div has no ``xml:id``.
        """
        ret = []
        for div in self.body.findAll('div', {'type': part}):
            div_id = div.attrs['xml:id']
            pt = {}
            # findAll also returns the heads of nested divs; keep only the
            # head whose parent is this div.
            for head in div.findAll('head'):
                if self._xml_id(head.parent) == div_id:
                    pt['title'] = head.contents
            pt['idstring'] = div_id
            if 'n' in div.attrs:
                pt['order'] = div.attrs['n']
            if 'o' in div.attrs:
                pt['number'] = div.attrs['o']
            parent_id = self._xml_id(div.parent)
            pt[keyname] = '' if parent_id is None else parent_id
            ret.append(pt)
        return ret

    def chapters(self):
        """Return all ``chapter`` divs; the parent id is stored under ``part``."""
        return self.findPart('chapter', 'part')

    def sections(self):
        """Return all ``section`` divs; the parent id is stored under ``chapter``."""
        return self.findPart('section', 'chapter')

    def subsections(self):
        """Return all ``subsection`` divs; the parent id is stored under ``section``."""
        return self.findPart('subsection', 'section')

    def subsubsections(self):
        """Return all ``subsubsection`` divs; the parent id is stored under ``subsection``."""
        return self.findPart('subsubsection', 'subsection')

    def paragraphs(self):
        """Return the flattened content of every ``p`` element in the body."""
        paras = self.body.findAll('p')
        return self.getContent(paras)

    def citations(self):
        """Return one entry per ``biblStruct`` element in the body.

        Each entry is a 3-tuple
        ``(('authors', [...]), ('editors', [...]), fields)`` where the lists
        hold ``{'lastname', 'firstname'}`` dicts and ``fields`` holds
        ``pubtype``, ``idstring``, ``date``, ``note``, optional ``pubPlace``
        and ``publisher``, ``titlea`` / ``titlem`` (title levels ``a`` and
        ``m``/``j``) and one key per ``biblScope`` unit.

        Raises:
            IndexError: if a person lacks surname/forename or the entry has
                no date.
            KeyError: if the entry lacks ``type`` or ``xml:id``, or a
                ``biblScope`` lacks ``unit``.
        """
        res = []
        for bib in self.body.findAll('biblStruct'):
            auth = self._persons(bib, 'author')
            edit = self._persons(bib, 'editor')
            tempB = {}
            # Place and publisher are optional; the keys are left out when
            # the elements are missing or empty.
            for optional in ('pubPlace', 'publisher'):
                value = self._first_text(bib, optional)
                if value is not None:
                    tempB[optional] = value
            tempB['pubtype'] = bib.attrs['type']
            tempB['idstring'] = bib.attrs['xml:id']
            tempB['date'] = bib.findAll('date')[0].contents[0]
            tempB['note'] = ''.join(
                notepart.text.strip() for notepart in bib.findAll('note')
            )
            for title in bib.findAll('title'):
                # Titles without a level attribute are ignored.
                level = title.attrs.get('level')
                if level == 'a':
                    tempB['titlea'] = title.contents[0]
                elif level in ['m', 'j']:
                    tempB['titlem'] = title.contents[0]
            for scope in bib.findAll('biblScope'):
                tempB[scope.attrs['unit']] = scope.contents[0]
            res.append((('authors', auth), ('editors', edit), tempB))
        return res
######################## | |
# | |
# Helper functions to write to db | |
# | |
######################## | |
def createPublication(text):
    """Store the publication described by *text* and return the model instance.

    Every key returned by ``text.publication()`` is passed as a keyword
    argument to ``Publication.objects.update_or_create``.
    """
    publication, _created = Publication.objects.update_or_create(
        **text.publication()
    )
    return publication
def createAuthors(text, publication):
    """Store every author of *text*, link each to *publication*, return them.

    Each dict from ``text.authors()`` is passed as keyword arguments to
    ``Author.objects.update_or_create``; the resulting records are returned
    in document order.
    """
    linked = []
    for name_fields in text.authors():
        record = Author.objects.update_or_create(**name_fields)[0]
        record.publications.add(publication)
        linked.append(record)
    return linked
def createChapters(text, publication):
    """Store every chapter of *text* under *publication* and return them.

    Each dict from ``text.chapters()`` is passed, with ``publication``
    added, as keyword arguments to ``Chapter.objects.update_or_create``.
    """
    stored = []
    for fields in text.chapters():
        # The id of the enclosing element is removed before saving; it is
        # not handed to the Chapter lookup.
        del fields['part']
        fields['publication'] = publication
        record = Chapter.objects.update_or_create(**fields)[0]
        stored.append(record)
    return stored
def createSections(text, publication):
    """Store every section of *text* whose parent chapter exists.

    For each dict from ``text.sections()`` the ``chapter`` entry (an id
    string) is replaced by the matching ``Chapter`` of *publication*, the
    publication is added, and the result is passed as keyword arguments to
    ``Section.objects.update_or_create``.

    Sections that cannot be stored are reported on stdout and skipped, so
    one bad entry never aborts the import.

    Returns:
        The list of stored ``Section`` instances, in document order.
    """
    secList = []
    for section in text.sections():
        # Copy before mutating: the messages below must show the entry as
        # it came from the document, including the chapter id.
        snapshot = dict(section)
        try:
            chapter_id = section.pop('chapter')
            # Scope the lookup to this publication; xml:ids such as
            # "chap1" are only unique within one document.
            section['chapter'] = Chapter.objects.get(
                idstring=chapter_id,
                publication=publication,
            )
            section['publication'] = publication
            secTemp, _created = Section.objects.update_or_create(**section)
        except Chapter.DoesNotExist:
            print('Section has no chapter as parent\n {0}'.format(snapshot))
        except Exception as exc:
            # Best-effort import: report the real cause instead of
            # mislabelling it as a missing chapter, then carry on.
            print('Could not import section ({0!r})\n {1}'.format(exc, snapshot))
        else:
            secList.append(secTemp)
    return secList
def createParagraphs(text, publication):
    """Store the paragraph content of *text* and attach it to its parent.

    ``text.paragraphs()`` yields one dict per content node of every
    paragraph. For each of them:

    1. a root ``MixedContent`` (position 0) is fetched or created for the
       paragraph id;
    2. a tag node becomes a child ``MixedContent`` plus a ``Text``, a plain
       string becomes a ``Text`` directly under the root;
    3. a ``Paragraph`` is fetched or created under the chapter (parent id
       starting with ``chapter``) or section (parent id starting with
       ``sec``) of *publication*.

    Nodes whose parent cannot be resolved still get their content rows but
    no ``Paragraph``; they are not added to the result.

    Returns:
        A list with one ``Paragraph`` per successfully attached content
        node (the same paragraph appears once per node).

    Raises:
        KeyError: if a content node has no ``paragraphid``.
    """
    parList = []
    for paragraph in text.paragraphs():
        mix, _created = MixedContent.objects.update_or_create(
            position=0,
            paragraphid=paragraph['paragraphid'],
        )
        if 'attributes' in paragraph:
            nested, _created = MixedContent.objects.update_or_create(
                parent=mix,
                position=paragraph['position'],
                tagname=paragraph['tagname'],
                attributes=paragraph['attributes'],
            )
            # NOTE(review): for tag nodes paragraph['text'] is the nested
            # content list, not a string - confirm Text.text accepts that.
            Text.objects.update_or_create(
                parent=nested,
                position=nested.position,
                text=paragraph['text'],
            )
        elif 'text' in paragraph:
            # The result is deliberately not bound to a local called
            # "text": that would rebind the function's own parameter.
            Text.objects.update_or_create(
                parent=mix,
                position=paragraph['position'],
                text=paragraph['text'],
            )

        parent = paragraph.get('parentid')
        if not isinstance(parent, str):
            # No usable parent id: nothing to attach the paragraph to.
            continue
        try:
            # Lookups are scoped to this publication; xml:ids are only
            # unique within one document.
            if parent.startswith('chapter'):
                owner = {'chapter': Chapter.objects.get(
                    idstring=parent, publication=publication)}
            elif parent.startswith('sec'):
                owner = {'section': Section.objects.get(
                    idstring=parent, publication=publication)}
            else:
                print(parent)
                continue
            paraTemp, _created = Paragraph.objects.update_or_create(
                publication=publication,
                idstring=paragraph['paragraphid'],
                content=mix,
                **owner
            )
        except (Chapter.DoesNotExist, Section.DoesNotExist):
            # Parent element was never imported: skip silently.
            continue
        except Exception as exc:
            # Best-effort import: report unexpected failures and carry on.
            print('Could not import paragraph ({0!r})\n {1}'.format(exc, paragraph))
            continue
        # Reached only when a Paragraph was stored for this node, so no
        # stale or unbound value can be appended.
        parList.append(paraTemp)
    return parList
def createCitations(text, publication):
    """Store every citation of *text* together with its authors and editors.

    Each entry of ``text.citations()`` is unpacked into its author list,
    editor list and field dict. The field dict is passed as keyword
    arguments to ``Citation.objects.update_or_create``; authors and editors
    are stored as ``Author`` records and linked to the citation through
    ``citations`` and ``editions`` respectively.

    *publication* is accepted for signature parity with the other helpers
    and is not used.

    Returns:
        ``(authors, editors, last_fields)``: the stored author records, the
        stored editor records, and the field dict of the last citation
        processed (None when *text* has no citations).
    """
    autList = []
    ediList = []
    # Pre-bound so that a document without citations returns cleanly
    # instead of failing on an unbound loop variable.
    cit = None
    for aut, edi, cit in text.citations():
        citTemp, _created = Citation.objects.update_or_create(**cit)
        # aut / edi are ('authors', [...]) / ('editors', [...]) pairs.
        for author in aut[1]:
            autTemp, _created = Author.objects.update_or_create(**author)
            autTemp.citations.add(citTemp)
            autList.append(autTemp)
        for editor in edi[1]:
            ediTemp, _created = Author.objects.update_or_create(**editor)
            ediTemp.editions.add(citTemp)
            ediList.append(ediTemp)
    # NOTE(review): the third element is the raw field dict of the last
    # citation, not a list of stored Citation records - kept for
    # compatibility with existing callers; confirm this is intended.
    return (autList, ediList, cit)
# print(cit['author']) | |
# try: | |
# cit['author'].split(' ') | |
# autTemp, created = Author.objects.update_or_create( | |
# firstname = cit['author'].split(' ')[1], | |
# lastname = cit['author'].split(' ')[-1], | |
# middlename = cit['author'].split(' ')[1:-1], | |
# ) | |
# except: | |
# pass | |
# cit.pop('author') | |
# citTemp, created = Citation.objects.update_or_create( | |
# **cit | |
# ) | |
# try: | |
# autTemp.citations.add(citTemp) | |
# autList.append(autTemp) | |
# except: | |
# pass | |
# citList.append(citTemp) | |
# return citList, autList |