Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
django-tei-importer/publications/management/commands/_importer_class.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
429 lines (404 sloc)
15.4 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from eoa.models import * | |
from collections import OrderedDict | |
#################### | |
# Importer class | |
#################### | |
class TEIimporter():
    """
    Importer for TEI documents to python objects.

    The document is handed in as nested dicts and lists in which XML
    attributes are stored under '@name' keys and element text under
    '#text' (presumably the output of xmltodict -- confirm with the caller).
    Plain ``dict`` and ``OrderedDict`` documents are both accepted.
    """
    mapList = []  # search path for values (not used by the methods below)
    wanted_key = ''  # search for "wanted_key" (not used by the methods below)
    basePath = ['TEI', 'text', 'body', 'div']  # path to find all text parts

    def __init__(self, data):
        """Store the parsed document that all other methods read from."""
        self.dataDict = data

    # ------------------------------------------------------------------
    # Generic helpers
    # ------------------------------------------------------------------
    def returnPath(self, keys, dataDict):
        """
        Return all paths (as lists) that end in the key ``keys[0]`` and
        contain ``keys[1]`` somewhere along the path.
        """
        return [
            list(path)
            for path in self.find_all_with_key(keys[0], dataDict)
            if keys[1] in path
        ]

    def getFromDict(self, dataDict, mapList):
        """
        Follow ``mapList`` (a sequence of keys / list indices) down into
        ``dataDict`` and return whatever is found there.

        Raises KeyError / IndexError / TypeError if the path does not exist.
        """
        for k in mapList:
            dataDict = dataDict[k]
        return dataDict

    def find_all_with_key(self, wanted_key, dataDict, path=()):
        """
        Yield, as tuples, the paths to every occurrence of ``wanted_key``
        in the nested dict/list structure ``dataDict``.

        Matches on the current level are yielded before deeper ones.
        """
        if isinstance(dataDict, list):
            for idx, el in enumerate(dataDict):
                yield from self.find_all_with_key(wanted_key, el, path + (idx,))
        elif isinstance(dataDict, dict):
            for k in dataDict:
                if k == wanted_key:
                    yield path + (k,)
            for k, v in dataDict.items():
                yield from self.find_all_with_key(wanted_key, v, path + (k,))

    @staticmethod
    def _as_list(node):
        """Return ``node`` as a list: lists unchanged, a dict wrapped, anything else empty."""
        if isinstance(node, list):
            return node
        if isinstance(node, dict):
            return [node]
        return []

    @staticmethod
    def _head_text(node):
        """Return ``node['head']['#text']`` or '' when there is no such text."""
        try:
            return node['head']['#text']
        except (KeyError, TypeError):
            return ''

    def _chapter_entries(self):
        """
        Return ``(path, node)`` for every div below ``basePath`` whose
        '@type' is 'chapter'.

        The path is the real location of the node in the document, so it
        stays correct when other divs (e.g. a bibliography) are siblings
        of the chapters.
        """
        divs = self.getFromDict(self.dataDict, self.basePath)
        if isinstance(divs, list):
            candidates = [(self.basePath + [idx], div) for idx, div in enumerate(divs)]
        elif isinstance(divs, dict):
            candidates = [(list(self.basePath), divs)]
        else:
            candidates = []
        return [
            (path, div)
            for path, div in candidates
            if isinstance(div, dict) and div.get('@type') == 'chapter'
        ]

    @staticmethod
    def _section_entries(chapter_path, chapter):
        """Return ``(path, node)`` for every div directly inside ``chapter``."""
        divs = chapter.get('div')
        if isinstance(divs, list):
            return [(chapter_path + ['div', idx], div) for idx, div in enumerate(divs)]
        if isinstance(divs, dict):
            return [(chapter_path + ['div'], divs)]
        return []

    @staticmethod
    def _paragraph_rows(container, chapter_id, section_id, path):
        """
        Return ``(chapter_id, section_id, paragraph_id, text)`` for every
        'p' element directly inside ``container``.

        A container without 'p' yields nothing. Paragraphs that cannot be
        read (not a dict, or missing '@id' / '#text') are reported on
        stdout and skipped.
        """
        if 'p' not in container:
            return []
        paragraphs = container['p']
        if isinstance(paragraphs, dict):
            paragraphs = [paragraphs]
        elif not isinstance(paragraphs, list):
            print('Error reading:', chapter_id, type(paragraphs), path)
            return []
        rows = []
        for paragraph in paragraphs:
            try:
                rows.append((chapter_id, section_id, paragraph['@id'], paragraph['#text']))
            except (KeyError, TypeError):
                print('Error reading:', chapter_id, type(paragraph), path)
        return rows

    # ------------------------------------------------------------------
    # Header information
    # ------------------------------------------------------------------
    def title(self):
        '''
        Return ``(series, number, title, subtitle)`` from the title fields.

        Assumption is that there is only one title group in fileDesc and
        that the structure
        <title level="s" n="Number">Type</title>
        <title type="main" level="m">Main-title</title>
        is universal. Subtitle information is optional, but always of
        the form
        <title type="sub" level="m">Sub-title</title>
        The subtitle is '' unless exactly three title elements are present.
        '''
        paths = self.returnPath(['title', 'fileDesc'], self.dataDict)
        res = self.getFromDict(self.dataDict, list(paths[0]))
        serie = res[0]['#text']
        number = res[0]['@n']
        title = res[1]['#text']
        subtitle = res[2]['#text'] if len(res) == 3 else ''
        return (serie, number, title, subtitle)

    def authors(self):
        """
        Give all authors: the value of the first 'author' key found
        below fileDesc, exactly as stored in the document.
        """
        paths = self.returnPath(['author', 'fileDesc'], self.dataDict)
        return self.getFromDict(self.dataDict, paths[0])

    def publisher(self):
        """
        Give the publisher: the value of the first 'publisher' key found
        below fileDesc, exactly as stored in the document.
        """
        paths = self.returnPath(['publisher', 'fileDesc'], self.dataDict)
        return self.getFromDict(self.dataDict, paths[0])

    def published_date(self):
        """
        Give the publication date: the '#text' of the first 'date'
        element found below fileDesc.
        """
        paths = self.returnPath(['date', 'fileDesc'], self.dataDict)
        res = self.getFromDict(self.dataDict, paths[0])
        return res['#text']

    # ------------------------------------------------------------------
    # Text structure
    # ------------------------------------------------------------------
    def chapters(self):
        """
        Chapters: Returned information is limited to id, counter and title.

        Returns a list of ``(id, counter, title)`` tuples; the title is ''
        when the chapter has no readable head.
        """
        return [
            (chapter['@id'], chapter['@n'], self._head_text(chapter))
            for _path, chapter in self._chapter_entries()
        ]

    def sections(self):
        """
        Sections: list of ``(chapter_id, section_id, counter, heading)``.

        The heading is determined per section ('' when the section has no
        readable head). Chapters without sections contribute nothing.
        """
        ret = []
        for chapter_path, chapter in self._chapter_entries():
            chapter_id = chapter['@id']
            for _path, section in self._section_entries(chapter_path, chapter):
                ret.append(
                    (chapter_id, section['@id'], section['@n'], self._head_text(section))
                )
        return ret

    def paragraphs(self):
        """
        Paragraphs: list of ``(chapter_id, section_id, paragraph_id, text)``.

        For each chapter the paragraphs directly below the chapter come
        first (with section_id ''), followed by the paragraphs of each of
        its sections, tagged with the id of the section that contains them.
        """
        ret = []
        for chapter_path, chapter in self._chapter_entries():
            chapter_id = chapter['@id']
            ret.extend(
                self._paragraph_rows(chapter, chapter_id, '', chapter_path + ['p'])
            )
            for section_path, section in self._section_entries(chapter_path, chapter):
                ret.extend(
                    self._paragraph_rows(
                        section, chapter_id, section['@id'], section_path + ['p']
                    )
                )
        return ret

    # ------------------------------------------------------------------
    # Bibliography
    # ------------------------------------------------------------------
    @staticmethod
    def _bibl_entry(bib):
        """
        Flatten one biblStruct element into a dict.

        'id' and 'pubtype' are required (KeyError / TypeError otherwise);
        every other field is optional and simply left out when absent.
        """
        missing = (KeyError, TypeError)
        entry = {'id': bib['@id'], 'pubtype': bib['@type']}
        try:
            author = bib['analytic']['author']
            entry['author'] = author['surname'] + ',' + author['forename']
            entry['titlea'] = bib['analytic']['title']['#text']
        except missing:
            pass
        try:
            # A monograph author takes precedence over the analytic one.
            author = bib['monogr']['author']
            entry['author'] = author['surname'] + ',' + author['forename']
        except missing:
            pass
        try:
            editor = bib['monogr']['editor']
            entry['editor'] = editor['surname'] + ',' + editor['forename']
        except missing:
            pass
        try:
            entry['titlem'] = bib['monogr']['title']['#text']
        except missing:
            pass
        for key in ['publisher', 'pubPlace', 'date']:
            try:
                entry[key] = bib['monogr']['imprint'][key]
            except missing:
                pass
        try:
            entry['note'] = bib['monogr']['imprint']['note']['p']['#text']
        except missing:
            pass
        try:
            scopes = bib['monogr']['imprint']['biblScope']
            if isinstance(scopes, dict):
                scopes = [scopes]
            if isinstance(scopes, list):
                for scope in scopes:
                    entry[scope['@unit']] = scope['#text']
        except missing:
            pass
        return entry

    def biblio(self):
        """
        Return the entries of the first div headed 'Bibliography' as a
        list of dicts (see ``_bibl_entry`` for the keys).

        Always returns a list; it is empty when the document has no
        bibliography. Entries without '@id' / '@type' are reported on
        stdout and skipped.
        """
        for path in self.returnPath(['div', 'body'], self.dataDict):
            for div in self._as_list(self.getFromDict(self.dataDict, path)):
                if self._head_text(div) != 'Bibliography':
                    continue
                try:
                    structs = div['listBibl']['biblStruct']
                except (KeyError, TypeError):
                    continue
                entries = []
                # A single biblStruct is stored as a dict, several as a list.
                for bib in self._as_list(structs):
                    try:
                        entries.append(self._bibl_entry(bib))
                    except (KeyError, TypeError):
                        print('Error reading:', 'biblStruct', type(bib), path)
                return entries
        return []
######################## | |
# | |
# Helper functions to write to db | |
# | |
######################## | |
def _split_author_name(name):
    """
    Split a "Firstname Middlename(s) Lastname" string on whitespace.

    Returns ``(first, middle, last)``; a single word is taken as the last
    name, and everything between the first and the last word becomes the
    middle names. Raises CommandError for a name without any word.
    """
    names = name.split()
    if not names:
        raise CommandError(
            'Cannot determine format for supplied name {0}'.format(names)
        )
    if len(names) == 1:
        return '', '', names[0]
    return names[0], ' '.join(names[1:-1]), names[-1]


def createAuthors(text, publication):
    """
    Create or update an Author for every author name of the document and
    attach ``publication`` to it.

    ``text.authors()`` may return a single name string or a list; list
    items that are not strings are skipped. Both shapes are split the same
    way (see ``_split_author_name``).

    Returns the list of Author objects.
    Raises CommandError when ``text.authors()`` is neither str nor list.

    NOTE(review): CommandError is not imported in the visible part of this
    file; it has to come from django.core.management.base -- confirm.
    """
    authors = text.authors()
    if isinstance(authors, str):
        authors = [authors]
    elif not isinstance(authors, list):
        raise CommandError(
            'Supplied author {0} has not the format "Firstname Middlename Lastname"'.format(authors)
        )
    authorObjects = []
    for author in authors:
        if not isinstance(author, str):
            continue
        first, middle, last = _split_author_name(author)
        aTemp, _created = Author.objects.update_or_create(
            firstname=first,
            lastname=last,
            middlenames=middle,
        )
        aTemp.publications.add(publication)
        authorObjects.append(aTemp)
    return authorObjects
def createPublication(text):
    """
    Create or update the Publication described by the document header.

    Series (lower-cased), number, title and subtitle come from
    ``text.title()``; publisher and date from the matching methods.
    ``pages`` and ``price`` are fixed placeholder values.

    Returns the Publication object.
    """
    title_fields = text.title()
    update_or_create = Publication.objects.update_or_create
    lookup = dict(
        series=title_fields[0].lower(),
        publication_id=title_fields[1],
        title=title_fields[2],
        subtitle=title_fields[3],
        publisher=text.publisher(),
        published_date=text.published_date(),
        pages=10,
        price=0.0,
    )
    publication, _created = update_or_create(**lookup)
    return publication
def createChapters(text, publication):
    """
    Create or update one Chapter per entry of ``text.chapters()``.

    Each entry supplies idstring, order and title (in that order); every
    chapter is bound to ``publication``. Returns the Chapter objects in
    document order.
    """
    return [
        Chapter.objects.update_or_create(
            idstring=entry[0],
            order=entry[1],
            title=entry[2],
            publication=publication,
        )[0]
        for entry in text.chapters()
    ]
def createSections(text, publication):
    """
    Create or update one Section per entry of ``text.sections()``.

    Each entry is ``(chapter_id, section_id, counter, heading)``. The
    owning Chapter is looked up by idstring *within* ``publication``:
    chapters are created per publication, so an idstring-only lookup
    becomes ambiguous once two publications share a chapter id.

    Returns the Section objects in document order.
    Raises Chapter.DoesNotExist when the chapter has not been created yet.
    """
    secList = []
    for section in text.sections():
        chapter = Chapter.objects.get(idstring=section[0], publication=publication)
        secTemp, _created = Section.objects.update_or_create(
            chapter=chapter,
            idstring=section[1],
            order=section[2],
            title=section[3],
            publication=publication,
        )
        secList.append(secTemp)
    return secList
def createParagraphs(text, publication):
    """
    Create or update one Paragraph per entry of ``text.paragraphs()``.

    Each entry is ``(chapter_id, section_id, paragraph_id, text)``; an
    empty section_id marks a paragraph that sits directly in the chapter.
    Chapter and Section are looked up within ``publication`` (and the
    section within its chapter) so that ids shared between publications
    cannot be mixed up. The paragraph text itself is not stored yet.

    Returns the Paragraph objects in document order.
    """
    parList = []
    for paragraph in text.paragraphs():
        print(paragraph)
        chapter = Chapter.objects.get(idstring=paragraph[0], publication=publication)
        lookup = {
            'chapter': chapter,
            'idstring': paragraph[2],
            'publication': publication,
        }
        if paragraph[1]:
            lookup['section'] = Section.objects.get(
                idstring=paragraph[1],
                chapter=chapter,
                publication=publication,
            )
            # NOTE(review): the result was never used by the original code
            # either; kept only for its database side effect -- confirm
            # whether this call is still wanted.
            MixedContent.objects.update_or_create()
        parTemp, _created = Paragraph.objects.update_or_create(**lookup)
        parList.append(parTemp)
    return parList
def _split_citation_author(author):
    """
    Split a citation author string into ``(first, middle, last)``.

    Bibliography entries store authors as "surname,forename"; a string
    without a comma is treated as "First Middle ... Last".
    """
    surname, comma, forename = author.partition(',')
    if comma:
        given = forename.split()
        last = surname.strip()
    else:
        words = author.split()
        given = words[:-1]
        last = words[-1] if words else ''
    first = given[0] if given else ''
    return first, ' '.join(given[1:]), last


def createCitations(text, publication):
    """
    Create or update a Citation for every entry of ``text.biblio()`` and,
    when the entry names an author, the matching Author.

    The 'author' key is removed from each entry before the remaining keys
    are passed to Citation as field values. Author handling is best
    effort: a failure to create or link an author is reported on stdout
    and the citation is still created. ``publication`` is accepted for
    symmetry with the other helpers but not used.

    Returns ``(citations, authors)``; ``authors`` holds only the authors
    that were successfully linked to a citation.
    """
    citList = []
    autList = []
    # biblio() may return None when the document has no bibliography.
    for cit in text.biblio() or []:
        author = cit.pop('author', None)
        autTemp = None  # reset so an earlier author is never linked here
        if isinstance(author, str) and author.strip():
            print(author)
            first, middle, last = _split_citation_author(author)
            try:
                autTemp, _created = Author.objects.update_or_create(
                    firstname=first,
                    lastname=last,
                    middlenames=middle,
                )
            except Exception as err:
                print('Error creating author:', author, err)
        citTemp, _created = Citation.objects.update_or_create(**cit)
        if autTemp is not None:
            try:
                autTemp.citations.add(citTemp)
                autList.append(autTemp)
            except Exception as err:
                print('Error linking author:', author, err)
        citList.append(citTemp)
    return citList, autList