# _importer_class.py

from eoa.models import *
from collections import OrderedDict


####################
# Importer class
####################

class TEIimporter():
    """
    Importer for TEI documents to python objects.

    ``data`` is the parsed document as nested dicts and lists in which
    attributes live under ``'@name'`` keys and element text under
    ``'#text'`` (the layout looks like xmltodict output -- TODO confirm).
    A repeated element is a list and a single one a bare dict; the
    accessors below accept both shapes.
    """
    mapList = []          # search path for values (not used inside this class)
    wanted_key = ''       # key to search for (not used inside this class)
    basePath = ['TEI', 'text', 'body', 'div']  # path to the top-level divs

    # Lookup failures meaning "element absent or of an unexpected shape".
    _LOOKUP_ERRORS = (KeyError, IndexError, TypeError)

    def __init__(self, data):
        self.dataDict = data

    def returnPath(self, keys, dataDict):
        """
        Return every path (as a list) leading to key ``keys[0]`` inside
        ``dataDict`` whose path also contains ``keys[1]``.
        """
        return [
            list(path)
            for path in self.find_all_with_key(keys[0], dataDict)
            if keys[1] in path
        ]

    def getFromDict(self, dataDict, mapList):
        """Follow the keys/indices of ``mapList`` into ``dataDict``."""
        for k in mapList:
            dataDict = dataDict[k]
        return dataDict

    def find_all_with_key(self, wanted_key, dataDict, path=tuple()):
        """
        Yield the path (tuple of keys and list indices) of every occurrence
        of ``wanted_key`` in the nested structure. Matches on one level are
        reported before the children of that level are visited.
        """
        if isinstance(dataDict, list):
            for idx, el in enumerate(dataDict):
                yield from self.find_all_with_key(wanted_key, el, path + (idx,))
        elif isinstance(dataDict, dict):
            for k in dataDict:
                if k == wanted_key:
                    yield path + (k,)
            for k, v in dataDict.items():
                yield from self.find_all_with_key(wanted_key, v, path + (k,))

    @staticmethod
    def _as_list(node):
        """Normalise 'absent / single element / repeated element' to a list."""
        if node is None:
            return []
        if isinstance(node, list):
            return node
        return [node]

    def _dig(self, node, *keys):
        """Follow ``keys`` into ``node``; return None if any step fails."""
        try:
            return self.getFromDict(node, keys)
        except self._LOOKUP_ERRORS:
            return None

    def _head_text(self, node):
        """Return the heading text of ``node`` or '' when it has none."""
        head = self._dig(node, 'head')
        if isinstance(head, str):
            # A head given as a bare string is used as the heading itself.
            return head
        text = self._dig(head, '#text')
        return '' if text is None else text

    def _first_in_file_desc(self, key):
        """
        Return the first ``key`` element whose path contains ``fileDesc``.
        Raises IndexError when the document has no such element.
        """
        paths = self.returnPath([key, 'fileDesc'], self.dataDict)
        return self.getFromDict(self.dataDict, paths[0])

    def title(self):
        '''
        Return ``(series, number, title, subtitle)``.

        Assumption is that there is only one title group in fileDesc and
        that the structure
        <title level="s" n="Number">Type</title>
        <title type="main" level="m">Main-title</title>
        is universal. Subtitle information is optional, but always of
        the form
        <title type="sub" level="m">Sub-title</title>
        and is returned as '' unless exactly three titles are present.
        '''
        res = self._first_in_file_desc('title')
        serie = res[0]['#text']
        number = res[0]['@n']
        title = res[1]['#text']
        subtitle = res[2]['#text'] if len(res) == 3 else ''
        return (serie, number, title, subtitle)

    def authors(self):
        """
        Return the first ``author`` entry below fileDesc, unprocessed.
        """
        return self._first_in_file_desc('author')

    def publisher(self):
        """
        Return the first ``publisher`` entry below fileDesc, unprocessed.
        """
        return self._first_in_file_desc('publisher')

    def published_date(self):
        """
        Return the ``#text`` of the first ``date`` entry below fileDesc.
        """
        return self._first_in_file_desc('date')['#text']

    def _chapter_divs(self):
        """Return the top-level div dicts whose ``@type`` is 'chapter'."""
        divs = self.getFromDict(self.dataDict, self.basePath)
        return [
            div for div in self._as_list(divs)
            if isinstance(div, dict) and div.get('@type') == 'chapter'
        ]

    def _section_divs(self, chapter):
        """Return the div dicts directly below ``chapter`` (may be empty)."""
        return [
            div for div in self._as_list(chapter.get('div'))
            if isinstance(div, dict)
        ]

    def _paragraph_rows(self, chapter_id, section_id, parent):
        """Build the paragraph tuples for the ``p`` children of ``parent``."""
        rows = []
        for par in self._as_list(parent.get('p')):
            if not isinstance(par, dict) or '@id' not in par:
                # Report and skip instead of aborting the whole import.
                print('Error reading:', chapter_id, type(par), section_id)
                continue
            rows.append(
                (chapter_id, section_id, par['@id'], par.get('#text', ''))
            )
        return rows

    def chapters(self):
        """
        Chapters: returned information is limited to id, counter and title,
        one ``(id, counter, title)`` tuple per chapter div.
        """
        return [
            (div['@id'], div['@n'], self._head_text(div))
            for div in self._chapter_divs()
        ]

    def sections(self):
        """
        Sections: one ``(chapter_id, id, counter, heading)`` tuple per div
        directly below a chapter. A section without a head gets ''.
        """
        ret = []
        for chapter in self._chapter_divs():
            for section in self._section_divs(chapter):
                ret.append((
                    chapter['@id'],
                    section['@id'],
                    section['@n'],
                    self._head_text(section),
                ))
        return ret

    def paragraphs(self):
        """
        Paragraphs: ``(chapter_id, section_id, id, text)`` tuples. For each
        chapter the paragraphs directly below it come first (section_id
        ''), followed by the paragraphs of its sections in document order.
        """
        ret = []
        for chapter in self._chapter_divs():
            chapter_id = chapter['@id']
            ret.extend(self._paragraph_rows(chapter_id, '', chapter))
            for section in self._section_divs(chapter):
                ret.extend(
                    self._paragraph_rows(chapter_id, section['@id'], section)
                )
        return ret

    def _person_name(self, person):
        """Return 'surname,forename' or None if either part is missing."""
        surname = self._dig(person, 'surname')
        forename = self._dig(person, 'forename')
        if isinstance(surname, str) and isinstance(forename, str):
            return surname + ',' + forename
        return None

    def _bibl_entry(self, bib):
        """Flatten one biblStruct into a dict; None if it has no id/type."""
        ident = self._dig(bib, '@id')
        pubtype = self._dig(bib, '@type')
        if ident is None or pubtype is None:
            return None
        entry = {'id': ident, 'pubtype': pubtype}
        # Later assignment wins: a monogr author replaces an analytic one.
        for level in ('analytic', 'monogr'):
            author = self._person_name(self._dig(bib, level, 'author'))
            if author is not None:
                entry['author'] = author
        editor = self._person_name(self._dig(bib, 'monogr', 'editor'))
        if editor is not None:
            entry['editor'] = editor
        for key, level in (('titlea', 'analytic'), ('titlem', 'monogr')):
            text = self._dig(bib, level, 'title', '#text')
            if text is not None:
                entry[key] = text
        imprint = self._dig(bib, 'monogr', 'imprint')
        if isinstance(imprint, dict):
            for key in ('publisher', 'pubPlace', 'date'):
                if key in imprint:
                    entry[key] = imprint[key]
            note = self._dig(imprint, 'note', 'p', '#text')
            if note is not None:
                entry['note'] = note
            # Each biblScope is stored under its own unit name (e.g. pages).
            for scope in self._as_list(imprint.get('biblScope')):
                unit = self._dig(scope, '@unit')
                text = self._dig(scope, '#text')
                if unit is not None and text is not None:
                    entry[unit] = text
        return entry

    def biblio(self):
        """
        Return the entries of the first div below body whose head reads
        'Bibliography' as a list of flat dicts (keys: id, pubtype and,
        where present, author, editor, titlea, titlem, publisher, pubPlace,
        date, note plus one key per biblScope unit). Returns an empty list
        when the document has no bibliography.
        """
        ret = []
        for path in self.returnPath(['div', 'body'], self.dataDict):
            for div in self._as_list(self.getFromDict(self.dataDict, path)):
                if self._head_text(div) != 'Bibliography':
                    continue
                entries = self._dig(div, 'listBibl', 'biblStruct')
                if entries is None:
                    continue
                for bib in self._as_list(entries):
                    entry = self._bibl_entry(bib)
                    if entry is not None:
                        ret.append(entry)
                return ret
        return ret

########################
#
# Helper functions to write to db
#
########################


def _split_author_name(fullname):
    """Split 'First [Middle ...] Last' into ``(first, middle, last)``."""
    names = fullname.split()
    if not names:
        raise CommandError(
            'Cannot determine format for supplied name {0}'.format(names)
            )
    if len(names) == 1:
        # A single word is treated as the last name.
        return '', '', names[0]
    return names[0], ' '.join(names[1:-1]), names[-1]


def createAuthors(text,publication):
    """
    Create or update one Author per author name of ``text`` and attach
    ``publication`` to each of them.

    ``text.authors()`` may return a single name string or a list of them;
    list entries that are not strings are skipped. Returns the list of
    Author objects. Raises CommandError for an empty name or when
    ``text.authors()`` is neither a string nor a list.
    """
    # NOTE(review): CommandError is not imported in the visible part of the
    # file; presumably it is provided by the star import -- confirm.
    authors = text.authors()
    if isinstance(authors, str):
        authors = [authors]
    elif not isinstance(authors, list):
        raise CommandError(
            'Supplied author {0} has not the format "Firstname Middlename Lastname"'.format(authors)
            )
    authorObjects = []
    for author in authors:
        if not isinstance(author, str):
            continue
        first, middle, last = _split_author_name(author)
        aTemp, created = Author.objects.update_or_create(
            firstname=first,
            lastname=last,
            middlenames=middle,
            )
        aTemp.publications.add(publication)
        authorObjects.append(aTemp)
    return authorObjects

def createPublication(text):
    """
    Create or update the Publication described by ``text`` and return it.

    Series (lower-cased), number, title and subtitle come from
    ``text.title()``; publisher and date from the matching accessors.
    ``pages`` and ``price`` are fixed at 10 and 0.0.
    """
    parts = text.title()
    fields = {
        'series': parts[0].lower(),
        'publication_id': parts[1],
        'title': parts[2],
        'subtitle': parts[3],
        'publisher': text.publisher(),
        'published_date': text.published_date(),
        'pages': 10,
        'price': 0.0,
    }
    record, _created = Publication.objects.update_or_create(**fields)
    return record

def createChapters(text,publication):
    """
    Create or update one Chapter per entry of ``text.chapters()`` for
    ``publication`` and return the Chapter objects in document order.
    """
    stored = []
    for entry in text.chapters():
        fields = {
            'idstring': entry[0],
            'order': entry[1],
            'title': entry[2],
            'publication': publication,
        }
        record, _created = Chapter.objects.update_or_create(**fields)
        stored.append(record)
    return stored

def createSections(text,publication):
    """
    Create or update one Section per entry of ``text.sections()`` and
    return the Section objects in document order.

    The owning chapter is looked up by idstring within ``publication``,
    so equal chapter ids in other publications cannot be picked up.
    Raises Chapter.DoesNotExist if the chapter has not been stored yet.
    """
    secList = []
    for section in text.sections():
        chapter = Chapter.objects.get(
            idstring = section[0],
            publication = publication
        )
        secTemp, created = Section.objects.update_or_create(
            chapter = chapter,
            idstring = section[1],
            order = section[2],
            title = section[3],
            publication = publication
        )
        secList.append(secTemp)
    return secList

def createParagraphs(text,publication):
    """
    Create or update one Paragraph per entry of ``text.paragraphs()`` and
    return the Paragraph objects in document order.

    Chapter and section are looked up by idstring within ``publication``.
    Paragraphs with an empty section id are stored without a section.
    The paragraph text (index 3 of each entry) is not stored.
    """
    parList = []
    for paragraph in text.paragraphs():
        print(paragraph)
        chapter = Chapter.objects.get(
            idstring = paragraph[0],
            publication = publication
        )
        fields = {
            'chapter': chapter,
            'idstring': paragraph[2],
            'publication': publication,
        }
        if paragraph[1]:
            fields['section'] = Section.objects.get(
                idstring = paragraph[1],
                chapter = chapter,
                publication = publication
            )
            # NOTE(review): called without lookup arguments and the result
            # was never used; kept only for its database side effect. It
            # fails once more than one MixedContent row exists -- confirm
            # what this was meant to store.
            MixedContent.objects.update_or_create()
        parTemp, created = Paragraph.objects.update_or_create(**fields)
        parList.append(parTemp)
    return parList

def createCitations(text,publication):
    """
    Create or update one Citation per entry of ``text.biblio()`` and, when
    the entry names an author, the matching Author.

    The author string is expected as 'Surname,Forename [Middle ...]', the
    format ``TEIimporter.biblio`` produces. All remaining keys of the
    entry are passed to Citation as field values. ``publication`` is not
    used. Returns ``(citations, authors)``.
    """
    citList = []
    autList = []
    for cit in text.biblio() or []:
        # Entries without an author are valid; 'author' is not a Citation
        # field and is therefore removed before the entry is stored.
        author = cit.pop('author', None)
        print(author)
        autTemp = None  # reset so an earlier author is never reused
        if isinstance(author, str) and author.strip():
            surname, _sep, forenames = author.partition(',')
            given = forenames.split()
            autTemp, created = Author.objects.update_or_create(
                firstname = given[0] if given else '',
                lastname = surname.strip(),
                middlenames = ' '.join(given[1:]),
            )
        citTemp, created = Citation.objects.update_or_create(
            **cit
            )
        if autTemp is not None:
            try:
                autTemp.citations.add(citTemp)
            except AttributeError:
                # NOTE(review): the `citations` relation on Author is not
                # visible from this file; as before, a missing relation
                # leaves the author unlinked and unreported -- confirm.
                pass
            else:
                autList.append(autTemp)
        citList.append(citTemp)
    return citList, autList
	from eoa.models import *
	from collections import OrderedDict


	####################
	# Importer class
	####################

	class TEIimporter():
	    """
	    Importer for TEI documents to python objects.

	    ``data`` is the parsed document as nested dicts and lists in which
	    attributes live under ``'@name'`` keys and element text under
	    ``'#text'`` (the layout looks like xmltodict output -- TODO confirm).
	    A repeated element is a list and a single one a bare dict; the
	    accessors below accept both shapes.
	    """
	    mapList = []          # search path for values (not used inside this class)
	    wanted_key = ''       # key to search for (not used inside this class)
	    basePath = ['TEI', 'text', 'body', 'div']  # path to the top-level divs

	    # Lookup failures meaning "element absent or of an unexpected shape".
	    _LOOKUP_ERRORS = (KeyError, IndexError, TypeError)

	    def __init__(self, data):
	        self.dataDict = data

	    def returnPath(self, keys, dataDict):
	        """
	        Return every path (as a list) leading to key ``keys[0]`` inside
	        ``dataDict`` whose path also contains ``keys[1]``.
	        """
	        return [
	            list(path)
	            for path in self.find_all_with_key(keys[0], dataDict)
	            if keys[1] in path
	        ]

	    def getFromDict(self, dataDict, mapList):
	        """Follow the keys/indices of ``mapList`` into ``dataDict``."""
	        for k in mapList:
	            dataDict = dataDict[k]
	        return dataDict

	    def find_all_with_key(self, wanted_key, dataDict, path=tuple()):
	        """
	        Yield the path (tuple of keys and list indices) of every occurrence
	        of ``wanted_key`` in the nested structure. Matches on one level are
	        reported before the children of that level are visited.
	        """
	        if isinstance(dataDict, list):
	            for idx, el in enumerate(dataDict):
	                yield from self.find_all_with_key(wanted_key, el, path + (idx,))
	        elif isinstance(dataDict, dict):
	            for k in dataDict:
	                if k == wanted_key:
	                    yield path + (k,)
	            for k, v in dataDict.items():
	                yield from self.find_all_with_key(wanted_key, v, path + (k,))

	    @staticmethod
	    def _as_list(node):
	        """Normalise 'absent / single element / repeated element' to a list."""
	        if node is None:
	            return []
	        if isinstance(node, list):
	            return node
	        return [node]

	    def _dig(self, node, *keys):
	        """Follow ``keys`` into ``node``; return None if any step fails."""
	        try:
	            return self.getFromDict(node, keys)
	        except self._LOOKUP_ERRORS:
	            return None

	    def _head_text(self, node):
	        """Return the heading text of ``node`` or '' when it has none."""
	        head = self._dig(node, 'head')
	        if isinstance(head, str):
	            # A head given as a bare string is used as the heading itself.
	            return head
	        text = self._dig(head, '#text')
	        return '' if text is None else text

	    def _first_in_file_desc(self, key):
	        """
	        Return the first ``key`` element whose path contains ``fileDesc``.
	        Raises IndexError when the document has no such element.
	        """
	        paths = self.returnPath([key, 'fileDesc'], self.dataDict)
	        return self.getFromDict(self.dataDict, paths[0])

	    def title(self):
	        '''
	        Return ``(series, number, title, subtitle)``.

	        Assumption is that there is only one title group in fileDesc and
	        that the structure
	        <title level="s" n="Number">Type</title>
	        <title type="main" level="m">Main-title</title>
	        is universal. Subtitle information is optional, but always of
	        the form
	        <title type="sub" level="m">Sub-title</title>
	        and is returned as '' unless exactly three titles are present.
	        '''
	        res = self._first_in_file_desc('title')
	        serie = res[0]['#text']
	        number = res[0]['@n']
	        title = res[1]['#text']
	        subtitle = res[2]['#text'] if len(res) == 3 else ''
	        return (serie, number, title, subtitle)

	    def authors(self):
	        """
	        Return the first ``author`` entry below fileDesc, unprocessed.
	        """
	        return self._first_in_file_desc('author')

	    def publisher(self):
	        """
	        Return the first ``publisher`` entry below fileDesc, unprocessed.
	        """
	        return self._first_in_file_desc('publisher')

	    def published_date(self):
	        """
	        Return the ``#text`` of the first ``date`` entry below fileDesc.
	        """
	        return self._first_in_file_desc('date')['#text']

	    def _chapter_divs(self):
	        """Return the top-level div dicts whose ``@type`` is 'chapter'."""
	        divs = self.getFromDict(self.dataDict, self.basePath)
	        return [
	            div for div in self._as_list(divs)
	            if isinstance(div, dict) and div.get('@type') == 'chapter'
	        ]

	    def _section_divs(self, chapter):
	        """Return the div dicts directly below ``chapter`` (may be empty)."""
	        return [
	            div for div in self._as_list(chapter.get('div'))
	            if isinstance(div, dict)
	        ]

	    def _paragraph_rows(self, chapter_id, section_id, parent):
	        """Build the paragraph tuples for the ``p`` children of ``parent``."""
	        rows = []
	        for par in self._as_list(parent.get('p')):
	            if not isinstance(par, dict) or '@id' not in par:
	                # Report and skip instead of aborting the whole import.
	                print('Error reading:', chapter_id, type(par), section_id)
	                continue
	            rows.append(
	                (chapter_id, section_id, par['@id'], par.get('#text', ''))
	            )
	        return rows

	    def chapters(self):
	        """
	        Chapters: returned information is limited to id, counter and title,
	        one ``(id, counter, title)`` tuple per chapter div.
	        """
	        return [
	            (div['@id'], div['@n'], self._head_text(div))
	            for div in self._chapter_divs()
	        ]

	    def sections(self):
	        """
	        Sections: one ``(chapter_id, id, counter, heading)`` tuple per div
	        directly below a chapter. A section without a head gets ''.
	        """
	        ret = []
	        for chapter in self._chapter_divs():
	            for section in self._section_divs(chapter):
	                ret.append((
	                    chapter['@id'],
	                    section['@id'],
	                    section['@n'],
	                    self._head_text(section),
	                ))
	        return ret

	    def paragraphs(self):
	        """
	        Paragraphs: ``(chapter_id, section_id, id, text)`` tuples. For each
	        chapter the paragraphs directly below it come first (section_id
	        ''), followed by the paragraphs of its sections in document order.
	        """
	        ret = []
	        for chapter in self._chapter_divs():
	            chapter_id = chapter['@id']
	            ret.extend(self._paragraph_rows(chapter_id, '', chapter))
	            for section in self._section_divs(chapter):
	                ret.extend(
	                    self._paragraph_rows(chapter_id, section['@id'], section)
	                )
	        return ret

	    def _person_name(self, person):
	        """Return 'surname,forename' or None if either part is missing."""
	        surname = self._dig(person, 'surname')
	        forename = self._dig(person, 'forename')
	        if isinstance(surname, str) and isinstance(forename, str):
	            return surname + ',' + forename
	        return None

	    def _bibl_entry(self, bib):
	        """Flatten one biblStruct into a dict; None if it has no id/type."""
	        ident = self._dig(bib, '@id')
	        pubtype = self._dig(bib, '@type')
	        if ident is None or pubtype is None:
	            return None
	        entry = {'id': ident, 'pubtype': pubtype}
	        # Later assignment wins: a monogr author replaces an analytic one.
	        for level in ('analytic', 'monogr'):
	            author = self._person_name(self._dig(bib, level, 'author'))
	            if author is not None:
	                entry['author'] = author
	        editor = self._person_name(self._dig(bib, 'monogr', 'editor'))
	        if editor is not None:
	            entry['editor'] = editor
	        for key, level in (('titlea', 'analytic'), ('titlem', 'monogr')):
	            text = self._dig(bib, level, 'title', '#text')
	            if text is not None:
	                entry[key] = text
	        imprint = self._dig(bib, 'monogr', 'imprint')
	        if isinstance(imprint, dict):
	            for key in ('publisher', 'pubPlace', 'date'):
	                if key in imprint:
	                    entry[key] = imprint[key]
	            note = self._dig(imprint, 'note', 'p', '#text')
	            if note is not None:
	                entry['note'] = note
	            # Each biblScope is stored under its own unit name (e.g. pages).
	            for scope in self._as_list(imprint.get('biblScope')):
	                unit = self._dig(scope, '@unit')
	                text = self._dig(scope, '#text')
	                if unit is not None and text is not None:
	                    entry[unit] = text
	        return entry

	    def biblio(self):
	        """
	        Return the entries of the first div below body whose head reads
	        'Bibliography' as a list of flat dicts (keys: id, pubtype and,
	        where present, author, editor, titlea, titlem, publisher, pubPlace,
	        date, note plus one key per biblScope unit). Returns an empty list
	        when the document has no bibliography.
	        """
	        ret = []
	        for path in self.returnPath(['div', 'body'], self.dataDict):
	            for div in self._as_list(self.getFromDict(self.dataDict, path)):
	                if self._head_text(div) != 'Bibliography':
	                    continue
	                entries = self._dig(div, 'listBibl', 'biblStruct')
	                if entries is None:
	                    continue
	                for bib in self._as_list(entries):
	                    entry = self._bibl_entry(bib)
	                    if entry is not None:
	                        ret.append(entry)
	                return ret
	        return ret

	########################
	#
	# Helper functions to write to db
	#
	########################


	def _split_author_name(fullname):
	    """Split 'First [Middle ...] Last' into ``(first, middle, last)``."""
	    names = fullname.split()
	    if not names:
	        raise CommandError(
	            'Cannot determine format for supplied name {0}'.format(names)
	            )
	    if len(names) == 1:
	        # A single word is treated as the last name.
	        return '', '', names[0]
	    return names[0], ' '.join(names[1:-1]), names[-1]


	def createAuthors(text,publication):
	    """
	    Create or update one Author per author name of ``text`` and attach
	    ``publication`` to each of them.

	    ``text.authors()`` may return a single name string or a list of them;
	    list entries that are not strings are skipped. Returns the list of
	    Author objects. Raises CommandError for an empty name or when
	    ``text.authors()`` is neither a string nor a list.
	    """
	    # NOTE(review): CommandError is not imported in the visible part of the
	    # file; presumably it is provided by the star import -- confirm.
	    authors = text.authors()
	    if isinstance(authors, str):
	        authors = [authors]
	    elif not isinstance(authors, list):
	        raise CommandError(
	            'Supplied author {0} has not the format "Firstname Middlename Lastname"'.format(authors)
	            )
	    authorObjects = []
	    for author in authors:
	        if not isinstance(author, str):
	            continue
	        first, middle, last = _split_author_name(author)
	        aTemp, created = Author.objects.update_or_create(
	            firstname=first,
	            lastname=last,
	            middlenames=middle,
	            )
	        aTemp.publications.add(publication)
	        authorObjects.append(aTemp)
	    return authorObjects

	def createPublication(text):
	    """
	    Create or update the Publication described by ``text`` and return it.

	    Series (lower-cased), number, title and subtitle come from
	    ``text.title()``; publisher and date from the matching accessors.
	    ``pages`` and ``price`` are fixed at 10 and 0.0.
	    """
	    parts = text.title()
	    fields = {
	        'series': parts[0].lower(),
	        'publication_id': parts[1],
	        'title': parts[2],
	        'subtitle': parts[3],
	        'publisher': text.publisher(),
	        'published_date': text.published_date(),
	        'pages': 10,
	        'price': 0.0,
	    }
	    record, _created = Publication.objects.update_or_create(**fields)
	    return record

	def createChapters(text,publication):
	    """
	    Create or update one Chapter per entry of ``text.chapters()`` for
	    ``publication`` and return the Chapter objects in document order.
	    """
	    stored = []
	    for entry in text.chapters():
	        fields = {
	            'idstring': entry[0],
	            'order': entry[1],
	            'title': entry[2],
	            'publication': publication,
	        }
	        record, _created = Chapter.objects.update_or_create(**fields)
	        stored.append(record)
	    return stored

	def createSections(text,publication):
	    """
	    Create or update one Section per entry of ``text.sections()`` and
	    return the Section objects in document order.

	    The owning chapter is looked up by idstring within ``publication``,
	    so equal chapter ids in other publications cannot be picked up.
	    Raises Chapter.DoesNotExist if the chapter has not been stored yet.
	    """
	    secList = []
	    for section in text.sections():
	        chapter = Chapter.objects.get(
	            idstring = section[0],
	            publication = publication
	        )
	        secTemp, created = Section.objects.update_or_create(
	            chapter = chapter,
	            idstring = section[1],
	            order = section[2],
	            title = section[3],
	            publication = publication
	        )
	        secList.append(secTemp)
	    return secList

	def createParagraphs(text,publication):
	    """
	    Create or update one Paragraph per entry of ``text.paragraphs()`` and
	    return the Paragraph objects in document order.

	    Chapter and section are looked up by idstring within ``publication``.
	    Paragraphs with an empty section id are stored without a section.
	    The paragraph text (index 3 of each entry) is not stored.
	    """
	    parList = []
	    for paragraph in text.paragraphs():
	        print(paragraph)
	        chapter = Chapter.objects.get(
	            idstring = paragraph[0],
	            publication = publication
	        )
	        fields = {
	            'chapter': chapter,
	            'idstring': paragraph[2],
	            'publication': publication,
	        }
	        if paragraph[1]:
	            fields['section'] = Section.objects.get(
	                idstring = paragraph[1],
	                chapter = chapter,
	                publication = publication
	            )
	            # NOTE(review): called without lookup arguments and the result
	            # was never used; kept only for its database side effect. It
	            # fails once more than one MixedContent row exists -- confirm
	            # what this was meant to store.
	            MixedContent.objects.update_or_create()
	        parTemp, created = Paragraph.objects.update_or_create(**fields)
	        parList.append(parTemp)
	    return parList

	def createCitations(text,publication):
	    """
	    Create or update one Citation per entry of ``text.biblio()`` and, when
	    the entry names an author, the matching Author.

	    The author string is expected as 'Surname,Forename [Middle ...]', the
	    format ``TEIimporter.biblio`` produces. All remaining keys of the
	    entry are passed to Citation as field values. ``publication`` is not
	    used. Returns ``(citations, authors)``.
	    """
	    citList = []
	    autList = []
	    for cit in text.biblio() or []:
	        # Entries without an author are valid; 'author' is not a Citation
	        # field and is therefore removed before the entry is stored.
	        author = cit.pop('author', None)
	        print(author)
	        autTemp = None  # reset so an earlier author is never reused
	        if isinstance(author, str) and author.strip():
	            surname, _sep, forenames = author.partition(',')
	            given = forenames.split()
	            autTemp, created = Author.objects.update_or_create(
	                firstname = given[0] if given else '',
	                lastname = surname.strip(),
	                middlenames = ' '.join(given[1:]),
	            )
	        citTemp, created = Citation.objects.update_or_create(
	            **cit
	            )
	        if autTemp is not None:
	            try:
	                autTemp.citations.add(citTemp)
	            except AttributeError:
	                # NOTE(review): the `citations` relation on Author is not
	                # visible from this file; as before, a missing relation
	                # leaves the author unlinked and unreported -- confirm.
	                pass
	            else:
	                autList.append(autTemp)
	        citList.append(citTemp)
	    return citList, autList