# _importer_class.py

from eoa.models import *
from collections import OrderedDict
from bs4 import BeautifulSoup
import re


####################
# Importer class
####################
class TEIimporter():
    """
    Importer for TEI documents to python objects.

    The document is parsed with BeautifulSoup's XML parser. The first
    ``teiHeader`` and the first ``body`` element are kept on the instance and
    the accessor methods build plain lists and dicts from them.
    """

    def __init__(self, data):
        """
        Parse the TEI file at path ``data``.

        Raises IndexError when the document has no ``teiHeader`` or no
        ``body`` element.
        """
        # Close the file handle as soon as the parser has consumed it.
        with open(data) as handle:
            self.soup = BeautifulSoup(handle, features="xml")
        self.header = self.soup.findAll('teiHeader')[0]
        self.body = self.soup.findAll('body')[0]

    @staticmethod
    def _xmlId(node, default=None):
        """
        Return the ``xml:id`` attribute of ``node``, or ``default`` when the
        node is missing, has no attributes or has no such attribute.
        """
        attrs = getattr(node, 'attrs', None) or {}
        return attrs.get('xml:id', default)

    @staticmethod
    def _persons(bib, tagname):
        """
        Return one ``{'lastname', 'firstname'}`` dict per ``tagname`` element
        (``author`` or ``editor``) found below ``bib``.

        Raises IndexError when such an element has no ``surname`` or no
        ``forename`` child.
        """
        people = []
        for person in bib.findAll(tagname):
            people.append({
                'lastname': person.findAll('surname')[0].contents[0],
                'firstname': person.findAll('forename')[0].contents[0],
            })
        return people

    def publication(self):
        """
        Collect publication metadata from the TEI header.

        Returns a dict with 'title', 'series' (lower-cased),
        'publication_id' (int, taken from the ``n`` attribute of the series
        title), 'publisher', 'created_date', 'pages' and 'price' (both always
        1), plus 'subtitle' when a non-empty sub title is present.

        Raises IndexError when a required element is missing or empty,
        KeyError when the series title has no ``n`` attribute and ValueError
        when that attribute is not an integer.
        """
        series = self.header.findAll('title', {'level': 's'})[0]
        res = {}
        res['title'] = self.header.findAll('title', {'type': 'main'})[0].contents[0]
        # The subtitle is optional; leave the key out when there is none.
        subtitles = self.header.findAll('title', {'type': 'sub'})
        if subtitles and subtitles[0].contents:
            res['subtitle'] = subtitles[0].contents[0]
        res['series'] = series.contents[0].lower()
        res['publication_id'] = int(series.attrs['n'])
        res['publisher'] = self.header.findAll('publisher')[0].contents[0]
        res['created_date'] = self.header.findAll('date')[0].contents[0]
        res['pages'] = 1
        res['price'] = 1
        return res

    def authors(self):
        """
        Split every header ``author`` into name parts.

        Returns a list of dicts: the last word is 'lastname', the first word
        (if there are at least two) is 'firstname' and anything in between is
        joined into 'middlenames'. Authors whose text is blank are skipped.
        """
        ret = []
        for node in self.header.findAll('author'):
            # split() without an argument collapses runs of whitespace, so
            # line breaks or double spaces in the XML do not produce empty
            # name parts.
            words = node.contents[0].split()
            if not words:
                continue
            au = {'lastname': words[-1]}
            if len(words) > 1:
                au['firstname'] = words[0]
            if len(words) > 2:
                au['middlenames'] = ' '.join(words[1:-1])
            ret.append(au)
        return ret

    def getContent(self, obj):
        """
        Flatten the children of every element in ``obj`` into dicts.

        Each dict carries 'parentid' (``xml:id`` of the element's parent, or
        '' when it has none) and 'position' (index among the element's
        children), plus 'paragraphid' when the child's own parent has an
        ``xml:id``. Tag children add 'tagname', 'attributes' and a nested
        'text' list built recursively; string children add stripped 'text'.
        """
        conList = []
        for element in obj:
            # Resolve the id for every element on its own so that a value
            # from a previous element is never carried over.
            parent = self._xmlId(element.parent, '')
            for i, con in enumerate(element.contents):
                temp = {'parentid': parent, 'position': i}
                paragraphid = self._xmlId(con.parent)
                if paragraphid is not None:
                    temp['paragraphid'] = paragraphid
                # Exact type-name match: subclasses of NavigableString
                # (e.g. comments) are not treated as text.
                kind = type(con).__name__
                if kind == 'Tag':
                    # Tag.name also works for tags without attributes and
                    # for self-closing tags.
                    temp['tagname'] = con.name
                    temp['attributes'] = con.attrs
                    temp['text'] = self.getContent([con])
                elif kind == 'NavigableString':
                    temp['text'] = str(con).strip()
                conList.append(temp)
        return conList

    def findPart(self, part, keyname):
        """
        Describe every ``div`` of type ``part`` in the body.

        Returns a list of dicts with 'idstring' (the div's ``xml:id``),
        'title' (contents of a ``head`` that is a direct child of the div),
        'order' / 'number' (the ``n`` / ``o`` attributes when present) and
        ``keyname`` (``xml:id`` of the div's parent, '' when it has none).

        Raises KeyError when a matching div has no ``xml:id``.
        """
        ret = []
        for div in self.body.findAll('div', {'type': part}):
            div_id = div.attrs['xml:id']
            pt = {}
            for title in div.findAll('head'):
                # Heads of nested elements are skipped; their parent may not
                # carry an xml:id at all.
                if self._xmlId(title.parent) == div_id:
                    pt['title'] = title.contents
            pt['idstring'] = div_id
            if 'n' in div.attrs:
                pt['order'] = div.attrs['n']
            if 'o' in div.attrs:
                pt['number'] = div.attrs['o']
            pt[keyname] = self._xmlId(div.parent, '')
            ret.append(pt)
        return ret

    def chapters(self):
        """Return ``findPart`` results for divs of type 'chapter'."""
        return self.findPart('chapter', 'part')

    def sections(self):
        """Return ``findPart`` results for divs of type 'section'."""
        return self.findPart('section', 'chapter')

    def subsections(self):
        """Return ``findPart`` results for divs of type 'subsection'."""
        return self.findPart('subsection', 'section')

    def subsubsections(self):
        """Return ``findPart`` results for divs of type 'subsubsection'."""
        return self.findPart('subsubsection', 'subsection')

    def paragraphs(self):
        """Return ``getContent`` results for every ``p`` element in the body."""
        paras = self.body.findAll('p')
        return self.getContent(paras)

    def citations(self):
        """
        Collect every ``biblStruct`` in the body.

        Returns a list of ``(('authors', [...]), ('editors', [...]), fields)``
        tuples. ``fields`` holds 'pubtype', 'idstring', 'date' and 'note',
        and, when present, 'pubPlace', 'publisher', 'titlea' (title level
        'a'), 'titlem' (title level 'm' or 'j') and one key per
        ``biblScope`` named after its ``unit`` attribute.

        Raises KeyError when a ``biblStruct`` has no ``type`` or ``xml:id``
        and IndexError when it has no ``date`` or a person lacks a
        ``surname`` / ``forename``.
        """
        res = []
        for bib in self.body.findAll('biblStruct'):
            auth = self._persons(bib, 'author')
            edit = self._persons(bib, 'editor')

            tempB = {}
            # Optional elements: only set the key when there is a value.
            for key in ('pubPlace', 'publisher'):
                found = bib.findAll(key)
                if found and found[0].contents:
                    tempB[key] = found[0].contents[0]
            tempB['pubtype'] = bib.attrs['type']
            tempB['idstring'] = bib.attrs['xml:id']
            tempB['date'] = bib.findAll('date')[0].contents[0]
            tempB['note'] = ''.join(
                notepart.text.strip() for notepart in bib.findAll('note')
            )
            for title in bib.findAll('title'):
                # Titles without a level attribute are ignored.
                level = title.attrs.get('level')
                if level == 'a':
                    tempB['titlea'] = title.contents[0]
                elif level in ('m', 'j'):
                    tempB['titlem'] = title.contents[0]
            for scope in bib.findAll('biblScope'):
                # Scopes without a unit attribute have no key to store under.
                unit = scope.attrs.get('unit')
                if unit is not None:
                    tempB[unit] = scope.contents[0]

            res.append((('authors', auth), ('editors', edit), tempB))
        return res

########################
#
# Helper functions to write to db
#
########################

def createPublication(text):
    """
    Store the publication described by ``text`` and return the model object.

    Every key/value pair of ``text.publication()`` is handed to
    ``Publication.objects.update_or_create`` as a keyword argument.
    """
    publication, _created = Publication.objects.update_or_create(
        **text.publication()
    )
    return publication

def createAuthors(text,publication):
    """
    Store every author of ``text`` and link each one to ``publication``.

    Returns the list of ``Author`` objects in document order.
    """
    saved = []
    for fields in text.authors():
        record, _created = Author.objects.update_or_create(**fields)
        record.publications.add(publication)
        saved.append(record)
    return saved

def createChapters(text,publication):
    """
    Store every chapter of ``text`` for ``publication``.

    Returns the list of ``Chapter`` objects in document order.
    """
    stored = []
    for fields in text.chapters():
        # The id of the enclosing part is dropped before the remaining
        # values are passed to the model.
        fields.pop('part')
        fields['publication'] = publication
        record, _created = Chapter.objects.update_or_create(**fields)
        stored.append(record)
    return stored

def createSections(text,publication):
    """
    Store every section of ``text`` whose parent is a known chapter.

    The parent id of each section is looked up as ``Chapter.idstring``.
    Sections whose chapter cannot be resolved (no match or more than one
    match) are reported on stdout and skipped. Errors raised while saving
    the section itself are not suppressed.

    Returns the list of ``Section`` objects that were stored.
    """
    secList = []
    for section in text.sections():
        # Copy before popping so the diagnostic still shows the chapter id.
        full = dict(section)
        chapID = section.pop('chapter')
        try:
            chapter = Chapter.objects.get(idstring=chapID)
        except (Chapter.DoesNotExist, Chapter.MultipleObjectsReturned):
            print('Section has no chapter as parent\n {0}'.format(full))
            continue
        section['publication'] = publication
        section['chapter'] = chapter
        secTemp, _created = Section.objects.update_or_create(**section)
        secList.append(secTemp)
    return secList

def createParagraphs(text,publication):
    """
    Store the paragraph fragments returned by ``text.paragraphs()``.

    For every fragment a ``MixedContent`` root (position 0) is stored for its
    'paragraphid'. Tag fragments additionally get a child ``MixedContent``
    plus a ``Text``; string fragments get a ``Text`` directly below the root.
    A ``Paragraph`` is then stored and attached to the chapter or section
    named by the fragment's 'parentid' (ids starting with 'chapter' or
    'sec'). Fragments with any other parent id are printed and skipped;
    fragments whose chapter/section cannot be resolved are skipped silently.

    Returns the list of ``Paragraph`` objects that were stored.
    Raises KeyError when a fragment has no 'paragraphid'.
    """
    parList = []
    for paragraph in text.paragraphs():
        mix, _created = MixedContent.objects.update_or_create(
            position = 0,
            paragraphid = paragraph['paragraphid']
        )
        if 'attributes' in paragraph:
            mix2, _created = MixedContent.objects.update_or_create(
                parent = mix,
                position = paragraph['position'],
                tagname = paragraph['tagname'],
                attributes = paragraph['attributes']
            )
            Text.objects.update_or_create(
                parent = mix2,
                position = mix2.position,
                text = paragraph['text'],
            )
        elif 'text' in paragraph:
            # The result is not bound to a name, so the ``text`` argument
            # is not overwritten.
            Text.objects.update_or_create(
                parent = mix,
                position = paragraph['position'],
                text = paragraph['text']
            )

        parent = paragraph['parentid']
        try:
            if parent.startswith('chapter'):
                owner = {'chapter': Chapter.objects.get(idstring=parent)}
            elif parent.startswith('sec'):
                owner = {'section': Section.objects.get(idstring=parent)}
            else:
                print(parent)
                continue
        except (Chapter.DoesNotExist, Chapter.MultipleObjectsReturned,
                Section.DoesNotExist, Section.MultipleObjectsReturned):
            # No unique chapter/section for this id: nothing to attach to.
            continue

        paraTemp, _created = Paragraph.objects.update_or_create(
            publication = publication,
            idstring = paragraph['paragraphid'],
            content = mix,
            **owner
        )
        # Only paragraphs stored in this iteration are collected.
        parList.append(paraTemp)
    return parList

def createCitations(text,publication):
    """
    Store every citation of ``text`` together with its authors and editors.

    Each citation's field dict is passed to
    ``Citation.objects.update_or_create``. Its authors and editors are stored
    as ``Author`` objects and linked through ``citations`` and ``editions``
    respectively. ``publication`` is accepted but not used.

    Returns ``(authors, editors, cit)`` where ``cit`` is the field dict of
    the last citation processed, or ``None`` when ``text`` has no citations.
    """
    # NOTE(review): the third return value looks like it was meant to be the
    # list of stored Citation objects; it is kept as-is so existing callers
    # see the same shape. Confirm before changing.
    autList = []
    ediList = []
    # Bound up front so an empty citation list does not raise at return.
    cit = None
    for aut, edi, cit in text.citations():
        citTemp, _created = Citation.objects.update_or_create(
            **cit
        )
        for author in aut[1]:
            autTemp, _created = Author.objects.update_or_create(
                **author
            )
            autTemp.citations.add(citTemp)
            autList.append(autTemp)
        for editor in edi[1]:
            ediTemp, _created = Author.objects.update_or_create(
                **editor
            )
            ediTemp.editions.add(citTemp)
            ediList.append(ediTemp)
    return (autList,ediList,cit)
	from eoa.models import *
	from collections import OrderedDict
	from bs4 import BeautifulSoup
	import re


	####################
	# Importer class
	####################
	class TEIimporter():
	    """
	    Importer for TEI documents to python objects.

	    The document is parsed with BeautifulSoup's XML parser. The first
	    ``teiHeader`` and the first ``body`` element are kept on the instance
	    and the accessor methods build plain lists and dicts from them.
	    """

	    def __init__(self, data):
	        """
	        Parse the TEI file at path ``data``.

	        Raises IndexError when the document has no ``teiHeader`` or no
	        ``body`` element.
	        """
	        # Close the file handle as soon as the parser has consumed it.
	        with open(data) as handle:
	            self.soup = BeautifulSoup(handle, features="xml")
	        self.header = self.soup.findAll('teiHeader')[0]
	        self.body = self.soup.findAll('body')[0]

	    @staticmethod
	    def _xmlId(node, default=None):
	        """
	        Return the ``xml:id`` attribute of ``node``, or ``default`` when
	        the node is missing, has no attributes or has no such attribute.
	        """
	        attrs = getattr(node, 'attrs', None) or {}
	        return attrs.get('xml:id', default)

	    @staticmethod
	    def _persons(bib, tagname):
	        """
	        Return one ``{'lastname', 'firstname'}`` dict per ``tagname``
	        element (``author`` or ``editor``) found below ``bib``.

	        Raises IndexError when such an element has no ``surname`` or no
	        ``forename`` child.
	        """
	        people = []
	        for person in bib.findAll(tagname):
	            people.append({
	                'lastname': person.findAll('surname')[0].contents[0],
	                'firstname': person.findAll('forename')[0].contents[0],
	            })
	        return people

	    def publication(self):
	        """
	        Collect publication metadata from the TEI header.

	        Returns a dict with 'title', 'series' (lower-cased),
	        'publication_id' (int, taken from the ``n`` attribute of the
	        series title), 'publisher', 'created_date', 'pages' and 'price'
	        (both always 1), plus 'subtitle' when a non-empty sub title is
	        present.

	        Raises IndexError when a required element is missing or empty,
	        KeyError when the series title has no ``n`` attribute and
	        ValueError when that attribute is not an integer.
	        """
	        series = self.header.findAll('title', {'level': 's'})[0]
	        res = {}
	        res['title'] = self.header.findAll('title', {'type': 'main'})[0].contents[0]
	        # The subtitle is optional; leave the key out when there is none.
	        subtitles = self.header.findAll('title', {'type': 'sub'})
	        if subtitles and subtitles[0].contents:
	            res['subtitle'] = subtitles[0].contents[0]
	        res['series'] = series.contents[0].lower()
	        res['publication_id'] = int(series.attrs['n'])
	        res['publisher'] = self.header.findAll('publisher')[0].contents[0]
	        res['created_date'] = self.header.findAll('date')[0].contents[0]
	        res['pages'] = 1
	        res['price'] = 1
	        return res

	    def authors(self):
	        """
	        Split every header ``author`` into name parts.

	        Returns a list of dicts: the last word is 'lastname', the first
	        word (if there are at least two) is 'firstname' and anything in
	        between is joined into 'middlenames'. Authors whose text is blank
	        are skipped.
	        """
	        ret = []
	        for node in self.header.findAll('author'):
	            # split() without an argument collapses runs of whitespace,
	            # so line breaks or double spaces in the XML do not produce
	            # empty name parts.
	            words = node.contents[0].split()
	            if not words:
	                continue
	            au = {'lastname': words[-1]}
	            if len(words) > 1:
	                au['firstname'] = words[0]
	            if len(words) > 2:
	                au['middlenames'] = ' '.join(words[1:-1])
	            ret.append(au)
	        return ret

	    def getContent(self, obj):
	        """
	        Flatten the children of every element in ``obj`` into dicts.

	        Each dict carries 'parentid' (``xml:id`` of the element's parent,
	        or '' when it has none) and 'position' (index among the element's
	        children), plus 'paragraphid' when the child's own parent has an
	        ``xml:id``. Tag children add 'tagname', 'attributes' and a nested
	        'text' list built recursively; string children add stripped
	        'text'.
	        """
	        conList = []
	        for element in obj:
	            # Resolve the id for every element on its own so that a value
	            # from a previous element is never carried over.
	            parent = self._xmlId(element.parent, '')
	            for i, con in enumerate(element.contents):
	                temp = {'parentid': parent, 'position': i}
	                paragraphid = self._xmlId(con.parent)
	                if paragraphid is not None:
	                    temp['paragraphid'] = paragraphid
	                # Exact type-name match: subclasses of NavigableString
	                # (e.g. comments) are not treated as text.
	                kind = type(con).__name__
	                if kind == 'Tag':
	                    # Tag.name also works for tags without attributes and
	                    # for self-closing tags.
	                    temp['tagname'] = con.name
	                    temp['attributes'] = con.attrs
	                    temp['text'] = self.getContent([con])
	                elif kind == 'NavigableString':
	                    temp['text'] = str(con).strip()
	                conList.append(temp)
	        return conList

	    def findPart(self, part, keyname):
	        """
	        Describe every ``div`` of type ``part`` in the body.

	        Returns a list of dicts with 'idstring' (the div's ``xml:id``),
	        'title' (contents of a ``head`` that is a direct child of the
	        div), 'order' / 'number' (the ``n`` / ``o`` attributes when
	        present) and ``keyname`` (``xml:id`` of the div's parent, '' when
	        it has none).

	        Raises KeyError when a matching div has no ``xml:id``.
	        """
	        ret = []
	        for div in self.body.findAll('div', {'type': part}):
	            div_id = div.attrs['xml:id']
	            pt = {}
	            for title in div.findAll('head'):
	                # Heads of nested elements are skipped; their parent may
	                # not carry an xml:id at all.
	                if self._xmlId(title.parent) == div_id:
	                    pt['title'] = title.contents
	            pt['idstring'] = div_id
	            if 'n' in div.attrs:
	                pt['order'] = div.attrs['n']
	            if 'o' in div.attrs:
	                pt['number'] = div.attrs['o']
	            pt[keyname] = self._xmlId(div.parent, '')
	            ret.append(pt)
	        return ret

	    def chapters(self):
	        """Return ``findPart`` results for divs of type 'chapter'."""
	        return self.findPart('chapter', 'part')

	    def sections(self):
	        """Return ``findPart`` results for divs of type 'section'."""
	        return self.findPart('section', 'chapter')

	    def subsections(self):
	        """Return ``findPart`` results for divs of type 'subsection'."""
	        return self.findPart('subsection', 'section')

	    def subsubsections(self):
	        """Return ``findPart`` results for divs of type 'subsubsection'."""
	        return self.findPart('subsubsection', 'subsection')

	    def paragraphs(self):
	        """Return ``getContent`` results for every ``p`` element in the body."""
	        paras = self.body.findAll('p')
	        return self.getContent(paras)

	    def citations(self):
	        """
	        Collect every ``biblStruct`` in the body.

	        Returns a list of
	        ``(('authors', [...]), ('editors', [...]), fields)`` tuples.
	        ``fields`` holds 'pubtype', 'idstring', 'date' and 'note', and,
	        when present, 'pubPlace', 'publisher', 'titlea' (title level
	        'a'), 'titlem' (title level 'm' or 'j') and one key per
	        ``biblScope`` named after its ``unit`` attribute.

	        Raises KeyError when a ``biblStruct`` has no ``type`` or
	        ``xml:id`` and IndexError when it has no ``date`` or a person
	        lacks a ``surname`` / ``forename``.
	        """
	        res = []
	        for bib in self.body.findAll('biblStruct'):
	            auth = self._persons(bib, 'author')
	            edit = self._persons(bib, 'editor')

	            tempB = {}
	            # Optional elements: only set the key when there is a value.
	            for key in ('pubPlace', 'publisher'):
	                found = bib.findAll(key)
	                if found and found[0].contents:
	                    tempB[key] = found[0].contents[0]
	            tempB['pubtype'] = bib.attrs['type']
	            tempB['idstring'] = bib.attrs['xml:id']
	            tempB['date'] = bib.findAll('date')[0].contents[0]
	            tempB['note'] = ''.join(
	                notepart.text.strip() for notepart in bib.findAll('note')
	            )
	            for title in bib.findAll('title'):
	                # Titles without a level attribute are ignored.
	                level = title.attrs.get('level')
	                if level == 'a':
	                    tempB['titlea'] = title.contents[0]
	                elif level in ('m', 'j'):
	                    tempB['titlem'] = title.contents[0]
	            for scope in bib.findAll('biblScope'):
	                # Scopes without a unit attribute have no key to store
	                # under.
	                unit = scope.attrs.get('unit')
	                if unit is not None:
	                    tempB[unit] = scope.contents[0]

	            res.append((('authors', auth), ('editors', edit), tempB))
	        return res

	########################
	#
	# Helper functions to write to db
	#
	########################

	def createPublication(text):
	    """
	    Store the publication described by ``text`` and return the model
	    object.

	    Every key/value pair of ``text.publication()`` is handed to
	    ``Publication.objects.update_or_create`` as a keyword argument.
	    """
	    params = text.publication()
	    pubTemp, _created = Publication.objects.update_or_create(
	        **params
	    )
	    return pubTemp

	def createAuthors(text,publication):
	    """
	    Store every author of ``text`` and link each one to ``publication``.

	    Returns the list of ``Author`` objects in document order.
	    """
	    autList = []
	    for author in text.authors():
	        autTemp, _created = Author.objects.update_or_create(
	            **author
	        )
	        autTemp.publications.add(publication)
	        autList.append(autTemp)
	    return autList

	def createChapters(text,publication):
	    """
	    Store every chapter of ``text`` for ``publication``.

	    Returns the list of ``Chapter`` objects in document order.
	    """
	    chptList = []
	    for chapter in text.chapters():
	        # The id of the enclosing part is dropped before the remaining
	        # values are passed to the model.
	        chapter.pop('part')
	        chapter['publication'] = publication
	        chpTemp, _created = Chapter.objects.update_or_create(
	            **chapter
	        )
	        chptList.append(chpTemp)
	    return chptList

	def createSections(text,publication):
	    """
	    Store every section of ``text`` whose parent is a known chapter.

	    The parent id of each section is looked up as ``Chapter.idstring``.
	    Sections whose chapter cannot be resolved (no match or more than one
	    match) are reported on stdout and skipped. Errors raised while saving
	    the section itself are not suppressed.

	    Returns the list of ``Section`` objects that were stored.
	    """
	    secList = []
	    for section in text.sections():
	        # Copy before popping so the diagnostic still shows the chapter id.
	        full = dict(section)
	        chapID = section.pop('chapter')
	        try:
	            chapter = Chapter.objects.get(idstring=chapID)
	        except (Chapter.DoesNotExist, Chapter.MultipleObjectsReturned):
	            print('Section has no chapter as parent\n {0}'.format(full))
	            continue
	        section['publication'] = publication
	        section['chapter'] = chapter
	        secTemp, _created = Section.objects.update_or_create(**section)
	        secList.append(secTemp)
	    return secList

	def createParagraphs(text,publication):
	    """
	    Store the paragraph fragments returned by ``text.paragraphs()``.

	    For every fragment a ``MixedContent`` root (position 0) is stored for
	    its 'paragraphid'. Tag fragments additionally get a child
	    ``MixedContent`` plus a ``Text``; string fragments get a ``Text``
	    directly below the root. A ``Paragraph`` is then stored and attached
	    to the chapter or section named by the fragment's 'parentid' (ids
	    starting with 'chapter' or 'sec'). Fragments with any other parent id
	    are printed and skipped; fragments whose chapter/section cannot be
	    resolved are skipped silently.

	    Returns the list of ``Paragraph`` objects that were stored.
	    Raises KeyError when a fragment has no 'paragraphid'.
	    """
	    parList = []
	    for paragraph in text.paragraphs():
	        mix, _created = MixedContent.objects.update_or_create(
	            position = 0,
	            paragraphid = paragraph['paragraphid']
	        )
	        if 'attributes' in paragraph:
	            mix2, _created = MixedContent.objects.update_or_create(
	                parent = mix,
	                position = paragraph['position'],
	                tagname = paragraph['tagname'],
	                attributes = paragraph['attributes']
	            )
	            Text.objects.update_or_create(
	                parent = mix2,
	                position = mix2.position,
	                text = paragraph['text'],
	            )
	        elif 'text' in paragraph:
	            # The result is not bound to a name, so the ``text`` argument
	            # is not overwritten.
	            Text.objects.update_or_create(
	                parent = mix,
	                position = paragraph['position'],
	                text = paragraph['text']
	            )

	        parent = paragraph['parentid']
	        try:
	            if parent.startswith('chapter'):
	                owner = {'chapter': Chapter.objects.get(idstring=parent)}
	            elif parent.startswith('sec'):
	                owner = {'section': Section.objects.get(idstring=parent)}
	            else:
	                print(parent)
	                continue
	        except (Chapter.DoesNotExist, Chapter.MultipleObjectsReturned,
	                Section.DoesNotExist, Section.MultipleObjectsReturned):
	            # No unique chapter/section for this id: nothing to attach to.
	            continue

	        paraTemp, _created = Paragraph.objects.update_or_create(
	            publication = publication,
	            idstring = paragraph['paragraphid'],
	            content = mix,
	            **owner
	        )
	        # Only paragraphs stored in this iteration are collected.
	        parList.append(paraTemp)
	    return parList

	def createCitations(text,publication):
	    """
	    Store every citation of ``text`` together with its authors and
	    editors.

	    Each citation's field dict is passed to
	    ``Citation.objects.update_or_create``. Its authors and editors are
	    stored as ``Author`` objects and linked through ``citations`` and
	    ``editions`` respectively. ``publication`` is accepted but not used.

	    Returns ``(authors, editors, cit)`` where ``cit`` is the field dict of
	    the last citation processed, or ``None`` when ``text`` has no
	    citations.
	    """
	    # NOTE(review): the third return value looks like it was meant to be
	    # the list of stored Citation objects; it is kept as-is so existing
	    # callers see the same shape. Confirm before changing.
	    autList = []
	    ediList = []
	    # Bound up front so an empty citation list does not raise at return.
	    cit = None
	    for aut, edi, cit in text.citations():
	        citTemp, _created = Citation.objects.update_or_create(
	            **cit
	        )
	        for author in aut[1]:
	            autTemp, _created = Author.objects.update_or_create(
	                **author
	            )
	            autTemp.citations.add(citTemp)
	            autList.append(autTemp)
	        for editor in edi[1]:
	            ediTemp, _created = Author.objects.update_or_create(
	                **editor
	            )
	            ediTemp.editions.add(citTemp)
	            ediList.append(ediTemp)
	    return (autList,ediList,cit)