Skip to content
Permalink
f289774945
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
429 lines (404 sloc) 15.4 KB
from eoa.models import *
from collections import OrderedDict
####################
# Importer class
####################
class TEIimporter():
    """
    Importer for TEI documents to python objects.

    The parsed document is kept in ``self.dataDict`` as nested dicts and
    lists.  The accessors read XML attributes from keys prefixed with ``@``
    and element text from the ``#text`` key (presumably the layout produced
    by xmltodict -- confirm against the caller that builds ``data``).
    """
    mapList = []  # search path for values (not read by any method of this class)
    # dataDict: the document as a (possibly ordered) dict, set per instance in __init__
    wanted_key = ''  # search for "wanted_key" (not read by any method of this class)
    basePath = ['TEI', 'text', 'body', 'div']  # path to find all text parts

    def __init__(self, data):
        """Store the parsed document ``data`` for later queries."""
        self.dataDict = data

    # ------------------------------------------------------------------
    # Generic helpers for walking the nested structure
    # ------------------------------------------------------------------
    def returnPath(self, keys, dataDict):
        """
        Return all paths (as lists) in ``dataDict`` that end in ``keys[0]``
        and contain ``keys[1]`` as one of their elements.
        """
        return [
            list(path)
            for path in self.find_all_with_key(keys[0], dataDict)
            if keys[1] in path
        ]

    def getFromDict(self, dataDict, mapList):
        """
        Follow ``mapList`` (dict keys and list indices) into ``dataDict``
        and return the value found there.  Missing steps raise the usual
        ``KeyError`` / ``IndexError`` / ``TypeError``.
        """
        for k in mapList:
            dataDict = dataDict[k]
        return dataDict

    def find_all_with_key(self, wanted_key, dataDict, path=tuple()):
        """
        Yield, as tuples, the paths to every occurrence of ``wanted_key``
        in the nested dict/list structure.  List levels contribute their
        integer index to the path.
        """
        if isinstance(dataDict, list):
            for idx, el in enumerate(dataDict):
                yield from self.find_all_with_key(wanted_key, el, path + (idx,))
        elif isinstance(dataDict, dict):
            for k in dataDict:
                if k == wanted_key:
                    yield path + (k,)
            # you can add order of width-search by sorting result of tree.items()
            for k, v in dataDict.items():
                yield from self.find_all_with_key(wanted_key, v, path + (k,))

    @staticmethod
    def _as_list(node):
        """Normalise a child value: None -> [], list -> itself, anything else -> [node]."""
        if node is None:
            return []
        if isinstance(node, list):
            return node
        return [node]

    @staticmethod
    def _head_text(node, default=''):
        """Return ``node['head']['#text']`` or ``default`` when it is not available."""
        try:
            return node['head']['#text']
        except (KeyError, TypeError):
            return default

    def _chapter_divs(self):
        """Return the body-level div nodes whose ``@type`` is ``'chapter'``, in document order."""
        divs = self._as_list(self.getFromDict(self.dataDict, self.basePath))
        return [
            div for div in divs
            if isinstance(div, dict) and div.get('@type') == 'chapter'
        ]

    def _section_divs(self, chapter):
        """Return the div nodes directly below ``chapter`` (its sections)."""
        return [
            div for div in self._as_list(chapter.get('div'))
            if isinstance(div, dict)
        ]

    def _paragraph_rows(self, chapter_id, section_id, container):
        """Build the paragraph tuples for the ``p`` children of ``container``."""
        rows = []
        for paragraph in self._as_list(container.get('p')):
            try:
                rows.append(
                    (chapter_id, section_id, paragraph['@id'], paragraph['#text'])
                )
            except (KeyError, TypeError):
                # Paragraph without id/text (or a bare string): report and skip it
                # instead of aborting the whole import.
                print('Error reading:', chapter_id, type(paragraph), section_id)
        return rows

    # ------------------------------------------------------------------
    # Header information
    # ------------------------------------------------------------------
    def title(self):
        '''
        Return all information concerning the title fields as the tuple
        ``(serie, number, title, subtitle)``.

        The assumption is that there is only one title in fileDesc and
        that the structure:
        <title level="s" n="Number">Type</title>
        <title type="main" level="m">Main-title</title>
        is universal. Subtitle information is optional, but always of
        the form:
        <title type="sub" level="m">Sub-title</title>
        When no subtitle is present an empty string is returned for it.
        '''
        paths = self.returnPath(['title', 'fileDesc'], self.dataDict)
        res = self.getFromDict(self.dataDict, list(paths[0]))
        serie = res[0]['#text']
        number = res[0]['@n']
        title = res[1]['#text']
        if len(res) == 3:
            subtitle = res[2]['#text']
        else:
            subtitle = ''
        return (serie, number, title, subtitle)

    def authors(self):
        """
        Return the value of the first ``author`` entry found below fileDesc.
        """
        paths = self.returnPath(['author', 'fileDesc'], self.dataDict)
        return self.getFromDict(self.dataDict, paths[0])

    def publisher(self):
        """
        Return the value of the first ``publisher`` entry found below fileDesc.
        """
        paths = self.returnPath(['publisher', 'fileDesc'], self.dataDict)
        return self.getFromDict(self.dataDict, paths[0])

    def published_date(self):
        """
        Return the ``#text`` of the first ``date`` entry found below fileDesc.
        """
        paths = self.returnPath(['date', 'fileDesc'], self.dataDict)
        res = self.getFromDict(self.dataDict, paths[0])
        return res['#text']

    # ------------------------------------------------------------------
    # Text structure
    # ------------------------------------------------------------------
    def chapters(self):
        """
        Chapters: Returned information is limited to id, counter and title.

        Returns a list of ``(id, counter, title)`` tuples, one per body div
        of type ``chapter``.  The title is '' when the chapter has no
        ``head`` text.  Both a single div (dict) and several divs (list)
        are supported; divs of another type are ignored.
        """
        return [
            (div['@id'], div['@n'], self._head_text(div))
            for div in self._chapter_divs()
        ]

    def sections(self):
        """
        Sections: returns a list of
        ``(chapter id, section id, section counter, heading)`` tuples.

        The heading is '' for a section without ``head`` text.  Chapters
        without sections contribute nothing.
        """
        ret = []
        for chapter in self._chapter_divs():
            for section in self._section_divs(chapter):
                ret.append((
                    chapter['@id'],
                    section['@id'],
                    section['@n'],
                    self._head_text(section),
                ))
        return ret

    def paragraphs(self):
        """
        Paragraphs: returns a list of
        ``(chapter id, section id, paragraph id, paragraph text)`` tuples.

        Paragraphs that sit directly below a chapter come first and carry
        '' as section id; they are followed by the paragraphs of each of
        the chapter's sections.  The chapter and section nodes are walked
        directly so that every paragraph is paired with the section it is
        actually contained in.
        """
        ret = []
        for chapter in self._chapter_divs():
            chapter_id = chapter['@id']
            ret.extend(self._paragraph_rows(chapter_id, '', chapter))
            for section in self._section_divs(chapter):
                ret.extend(
                    self._paragraph_rows(chapter_id, section['@id'], section)
                )
        return ret

    # ------------------------------------------------------------------
    # Bibliography
    # ------------------------------------------------------------------
    @staticmethod
    def _bibl_entry(bib):
        """
        Flatten one ``biblStruct`` node into a dict.

        ``id`` and ``pubtype`` are mandatory (a missing ``@id`` / ``@type``
        raises); every other field is copied only when it is present.
        """
        entry = {'id': bib['@id'], 'pubtype': bib['@type']}
        missing = (KeyError, TypeError)
        try:
            person = bib['analytic']['author']
            entry['author'] = person['surname'] + ',' + person['forename']
            entry['titlea'] = bib['analytic']['title']['#text']
        except missing:
            pass
        # A monograph author, when present, overrides the analytic one.
        for role in ('author', 'editor'):
            try:
                person = bib['monogr'][role]
                entry[role] = person['surname'] + ',' + person['forename']
            except missing:
                pass
        try:
            entry['titlem'] = bib['monogr']['title']['#text']
        except missing:
            pass
        for key in ['publisher', 'pubPlace', 'date']:
            try:
                entry[key] = bib['monogr']['imprint'][key]
            except missing:
                pass
        try:
            entry['note'] = bib['monogr']['imprint']['note']['p']['#text']
        except missing:
            pass
        try:
            scopes = bib['monogr']['imprint']['biblScope']
            if not isinstance(scopes, list):
                scopes = [scopes]
            # Each scope is stored under its own unit name (e.g. page, volume).
            for scope in scopes:
                entry[scope['@unit']] = scope['#text']
        except missing:
            pass
        return entry

    def biblio(self):
        """
        Return the bibliography as a list of dicts, one per ``biblStruct``.

        The first div below ``body`` whose head text is ``Bibliography`` is
        used.  A single ``biblStruct`` (dict) and several (list) are both
        supported.  An empty list is returned when the document has no
        bibliography.
        """
        for path in self.returnPath(['div', 'body'], self.dataDict):
            for div in self._as_list(self.getFromDict(self.dataDict, path)):
                if self._head_text(div, None) != 'Bibliography':
                    continue
                try:
                    structs = div['listBibl']['biblStruct']
                except (KeyError, TypeError):
                    continue
                ret = []
                for bib in self._as_list(structs):
                    try:
                        ret.append(self._bibl_entry(bib))
                    except (KeyError, TypeError):
                        # Entry without @id/@type: report and skip it.
                        print('Error reading:', 'biblStruct', type(bib))
                return ret
        return []
########################
#
# Helper functions to write to db
#
########################
def createAuthors(text,publication):
    """
    Create or update an ``Author`` for every author of ``text`` and attach
    each one to ``publication``.

    ``text.authors()`` may return a single name (str) or a list of names.
    Every name is split on single spaces: the first token is the first
    name, the last token the last name and everything in between the
    middle names; a one-token name is stored as last name only.  Entries
    of a list that are not strings are skipped.

    Returns the list of ``Author`` objects.
    Raises ``CommandError`` when ``text.authors()`` is neither a str nor a
    list.
    """
    authors = text.authors()
    if isinstance(authors, str):
        candidates = [authors]
    elif isinstance(authors, list):
        candidates = authors
    else:
        raise CommandError(
            'Supplied author {0} has not the format "Firstname Middlename Lastname"'.format(authors)
        )

    authorObjects = []
    for author in candidates:
        if not isinstance(author, str):
            continue
        # str.split(' ') always yields at least one token, so the three
        # cases below are exhaustive.
        names = author.split(' ')
        if len(names) > 2:
            first = names[0]
            middle = ' '.join(names[1:-1])
            last = names[-1]
        elif len(names) == 2:
            first, middle, last = names[0], '', names[1]
        else:
            first, middle, last = '', '', names[0]
        aTemp, created = Author.objects.update_or_create(
            firstname=first,
            lastname=last,
            middlenames=middle,
        )
        aTemp.publications.add(publication)
        authorObjects.append(aTemp)
    return authorObjects
def createPublication(text):
    """
    Create or update the ``Publication`` described by ``text`` and return it.

    Series (lower-cased), number, title and subtitle are taken from
    ``text.title()``; publisher and date from the matching accessors.
    ``pages`` and ``price`` are fixed to 10 and 0.0.
    """
    info = text.title()
    upsert = Publication.objects.update_or_create
    record, _was_created = upsert(
        series=info[0].lower(),
        publication_id=info[1],
        title=info[2],
        subtitle=info[3],
        publisher=text.publisher(),
        published_date=text.published_date(),
        pages=10,
        price=0.0,
    )
    return record
def createChapters(text,publication):
    """
    Create or update one ``Chapter`` per entry of ``text.chapters()``
    (id, counter, title) for ``publication`` and return them as a list.
    """
    return [
        Chapter.objects.update_or_create(
            idstring=entry[0],
            order=entry[1],
            title=entry[2],
            publication=publication,
        )[0]  # update_or_create gives (object, created); only the object is kept
        for entry in text.chapters()
    ]
def createSections(text,publication):
    """
    Create or update one ``Section`` per entry of ``text.sections()``
    (chapter id, section id, counter, title) and return them as a list.

    The parent chapter is looked up by its idstring *within*
    ``publication``: chapter ids are only unique inside one document, so
    an unscoped lookup can hit a chapter of another publication or fail
    with MultipleObjectsReturned.
    """
    secList = []
    for section in text.sections():
        chapter = Chapter.objects.get(
            idstring=section[0],
            publication=publication,
        )
        secTemp, created = Section.objects.update_or_create(
            chapter=chapter,
            idstring=section[1],
            order=section[2],
            title=section[3],
            publication=publication,
        )
        secList.append(secTemp)
    return secList
def createParagraphs(text,publication):
    """
    Create or update one ``Paragraph`` per entry of ``text.paragraphs()``
    (chapter id, section id or '', paragraph id, text) and return them as
    a list.  The paragraph text itself is not stored here.

    Chapter and section are looked up by idstring *within*
    ``publication``, because those ids are only unique inside one
    document.  The section is only set for paragraphs that belong to one.
    """
    parList = []
    for paragraph in text.paragraphs():
        print(paragraph)
        lookup = {
            'chapter': Chapter.objects.get(
                idstring=paragraph[0],
                publication=publication,
            ),
            'idstring': paragraph[2],
            # text = paragraph[3] is intentionally not written (see docstring)
            'publication': publication,
        }
        if paragraph[1]:
            lookup['section'] = Section.objects.get(
                idstring=paragraph[1],
                publication=publication,
            )
            # NOTE(review): the result of this call was never used in the
            # original either; it is kept only for its get-or-create side
            # effect on MixedContent -- confirm whether it is still wanted.
            MixedContent.objects.update_or_create()
        parTemp, created = Paragraph.objects.update_or_create(**lookup)
        parList.append(parTemp)
    return parList
def createCitations(text,publication):
    """
    Create or update one ``Citation`` per bibliography entry of ``text``
    and, when the entry names an author, the matching ``Author``.

    The ``author`` value is removed from the entry before the remaining
    fields are passed to ``Citation``.  It is expected in the form
    ``Surname,Forename(s)``; a value without a comma is read as
    ``Forename(s) Surname``.  The first forename becomes the first name,
    further forenames the middle names.  Entries without an author only
    produce a citation.

    ``publication`` is accepted for symmetry with the other helpers but is
    not used.

    Returns ``(citations, authors)``; both lists are empty when the text
    has no bibliography.
    """
    citList = []
    autList = []
    # biblio() may yield nothing at all for documents without bibliography.
    for cit in text.biblio() or []:
        fields = dict(cit)
        author_name = fields.pop('author', None)
        print(author_name)

        # Start every entry without an author so that an author created
        # for an earlier citation is never attached to this one.
        autTemp = None
        if isinstance(author_name, str) and author_name.strip():
            if ',' in author_name:
                last, _, given = author_name.partition(',')
                given_parts = given.split()
            else:
                tokens = author_name.split()
                given_parts, last = tokens[:-1], tokens[-1]
            autTemp, created = Author.objects.update_or_create(
                firstname=given_parts[0] if given_parts else '',
                lastname=last.strip(),
                middlenames=' '.join(given_parts[1:]),
            )

        citTemp, created = Citation.objects.update_or_create(**fields)
        if autTemp is not None:
            autTemp.citations.add(citTemp)
            autList.append(autTemp)
        citList.append(citTemp)
    return citList, autList