diff --git a/.gitignore b/.gitignore index 817c11d..e9b30c8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,10 @@ +output +examples + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] +*$py.class # C extensions *.so @@ -42,6 +46,7 @@ htmlcov/ nosetests.xml coverage.xml *,cover +.hypothesis/ # Translations *.mo @@ -49,6 +54,14 @@ coverage.xml # Django stuff: *.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy # Sphinx documentation docs/_build/ @@ -58,3 +71,25 @@ target/ # Macos turds .DS_Store + +# IPython Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# dotenv +.env + +# virtualenv +venv/ +ENV/ + +# Spyder project settings +.spyderproject + +# Rope project settings +.ropeproject diff --git a/LICENSE b/LICENSE index 0f34066..2e0375b 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2017 Max Planck Institute for the History of Science +Copyright (c) 2016-2018 Max Planck Institute for the History of Science Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 3a9b38d..512989d 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ you the PDF version of the document. Next, comment line 9 in `EOASample.tex` (the EOA preambel) and uncomment line 10 (the XML preambel) and run the older version of -biber (biber v2.1). +biber (biber v2.1). biber_2.1 EOASample @@ -48,5 +48,15 @@ If everything went well, you can also try and run These scripts don't take any arguments and will produce output in the `CONVERT` directory. +# Convert TEI to EOADjango # +Suite of functions to get from TEI encoded XML into the workflow of Edition Open Access. 
The main output file is an XML file called `IntermediateXML.xml` which can subsequently processed with `tralics2django`, a tool found in the `EOASkripts` repository. +Code written in Python3. + +External dependencies +--------------------- +- lxml +- BeautifulSoup +- pandoc +- pandoc-citeproc diff --git a/data/exampleTEI.xml b/data/exampleTEI.xml new file mode 100644 index 0000000..441a6d9 --- /dev/null +++ b/data/exampleTEI.xml @@ -0,0 +1,617 @@ + + + + + + + + + Studies + Der ewige Testband + Experimentell + Klaus Thoden + NN + Lindy Divarci + NN + Myself + + + + + + + Edition Open Access + pro-business.com + 2016-08-04 + 978-3-945561-XXX + DOI + + + + by-nc-sa + + + + Klein anfangen. + Groß enden. + + + + + + + MPRL + Edition Open Access + + + + + Deutsch + + + + + +
+ + + + Chemical Laboratory. This idealized laboratory with metallurgical furnaces is from William Lewis, Commercium Philosophico-Technicum (London, 1756). Courtesy of Smith Image Collection, Van Pelt Dietrich Library, University of Pennsylvania. +
+ + +
+ Max Planck Research Library for the History and Development of Knowledge +
+ Series Editors +

Ian T. Baldwin, Gerd Graßhoff, Jürgen Renn, Dagmar Schäfer, Robert Schlögl, Bernard F. Schutz

+
+
+ Edition Open Access Development Team +

Lindy Divarci, Bendix Düker, Samuel Gfrörer, Klaus Thoden, Dirk Wintergrün.

+
+
+

+ The Edition Open Access (EOA) platform was founded to bring together publication initiatives seeking to disseminate the results of scholarly work in a format that combines traditional publications with the digital medium. It currently hosts the open-access publications of the “Max Planck Research Library for the History and Development of Knowledge” (MPRL) and “Edition Open Sources” (EOS). EOA is open to host other open access initiatives similar in conception and spirit, in accordance with the Berlin Declaration on Open Access to Knowledge in the sciences and humanities, which was launched by the Max Planck Society in 2003. +

+

+ By combining the advantages of traditional publications and the digital medium, the platform offers a new way of publishing research and of studying historical topics or current issues in relation to primary materials that are otherwise not easily available. The volumes are available both as printed books and as online open access publications. They are directed at scholars and students of various disciplines, and at a broader public interested in how science shapes our world. +

+
+
+ + + +
+ Max Planck Research Library for the History and Development of Knowledge +

+ The Max Planck Research Library for the History and Development of Knowledge comprises the subseries, Studies, Proceedings and Textbooks. They present original scientific work submitted under the scholarly responsibility of members of the Scientific Board and their academic peers. The initiative is currently supported by research departments of three Max Planck Institutes: the MPI for the History of Science, the Fritz Haber Institute of the MPG and the MPI for Gravitational Physics (Albert Einstein Institute). The publications of the Studies series are dedicated to key subjects in the history and development of knowledge, bringing together perspectives from different fields and combining source-based empirical research with theoretically guided approaches. The Proceedings series presents the results of scientific meetings on current issues and supports, at the same time, further cooperation on these issues by offering an electronic platform with further resources and the possibility for comments and interactions. +

+
+
+ Scientific Board +

+ Markus Antonietti, Antonio Becchi, Fabio Bevilacqua, William G. Boltz, Jens Braarvik, Horst Bredekamp, Jed Z. Buchwald, Olivier Darrigol, Thomas Duve, Mike Edmunds, Fynn Ole Engler, Robert K. Englund, Mordechai Feingold, Rivka Feldhay, Gideon Freudenthal, Paolo Galluzzi, Kostas Gavroglu, Mark Geller, Domenico Giulini, Günther Görz, Gerd Graßhoff, James Hough, Manfred Laubichler, Glenn Most, Klaus Müllen, Pier Daniele Napolitani, Alessandro Nova, Hermann Parzinger, Dan Potts, Sabine Schmidtke, Circe Silva da Silva, Ana Simões, Dieter Stein, Richard Stephenson, Mark Stitt, Noel M. Swerdlow, Liba Taub, Martin Vingron, Scott Walter, Norton Wise, Gerhard Wolf, Rüdiger Wolfrum, Gereon Wolters, Zhang Baichun. +

+
+
+ + +
+ This is the first part + + +
+ Document structure + +
+ This is the first section +

This is the first section of a text. It is preceded by two hierarchical + units, namely Part and Chapter. As you can see above, the chapter + command has two arguments, the first one being the running head which + is displayed in the header of each page, the second being the text + that is printed on the page where the chapter begins.

+ +

We offer two additional units below the section: the subsection and + the subsubsection. See below how they are displayed!

+ + +
+ This is a subsection +

Here we are at an even lower hierarchical level. This can be quite + useful. Be aware that this level is not being displayed in the table + of contents. Now, there is still one level below that: the subsubsection.

+ + +
+ A subsubsection +

A subsubsection will never be numbered and like its predecessor, it + will not show up in the table of contents.

+
+
+
+
+ +
+ Unnumbered document structure +

Chapter, section and subsection can also be used without numbers. Do + you see the slight changes in the layout? Also, the counter is not + incremented. What should we use this for?

+

How can we see if the next two are section or subsection? They have + the same height. What shall we do??

+
+ Section without numbers +

This is the first section of a text. It is preceded by two hierarchical + units, namely Part and Chapter. As you can see above, the chapter + command has two arguments, the first one being the running head which + is displayed in the header of each page. Structure your text wisely.

+

Now, a running head only makes sense if your chapter spreads across + several pages.

+
+ Subsection without number + +

Here we are at an even lower hierarchical level. This can be quite + useful. Be aware that this level is not being displayed in the table + of contents.

+
+
+
+ +
+ Markup +

The EOA flavour of Latex offers quite a few commands that help you + markup words in your text. And also, if you want to include words in + different writing systems, they have to be preceded by commands so + that the system can switch to the correct font. The whole scope is + gathered in the next, rather experimental section.

+
+ Non-Latin alphabets +

This section showcases text written in writing systems other than + Latin. It includes Russian, Chinese, Hebrew and Greek.

+
+ Russian + +

First, Russian: В начале двадцатого века был + одним из идеологов богостроительства, в 1909 году помогал участникам + этого течения содержать фракционную школу на острове Капри для + рабочих, которую В. И. Ленин называл "литераторским центром + богостроительства".

+
+
+ Chinese + +

Next, Chinese: 法兰克人接受了高卢罗马文化, + 改操罗曼语族语言(但在罗马人分布较少的高卢北部人多操日耳曼语族语言)。 + 克洛维定巴黎为首都,建立了新的王朝,史称墨洛温王朝,但是这个王朝在克 + 洛维死后陷入分裂,克洛维的四个儿子按照法兰克人的习惯,将法兰克国家一 + 分为四,分别是巴黎、奥尔良、苏瓦松和兰斯。

+
+
+ Hebrew + +

Then, Hebrew:ארגינעל האט די סעקרעטאריאט שטאב געדארפט צו זיין א + פאראייניגטע גרופע פון אומפארטייאישע שליחים פון יעדע לאנד וועלכע האבן + נאר די אינטערעסן פון זייערע אייגענע לענדער אבער עס האט זיך קיינמאל + נישט געהעריג אויסגעארבעט און נאר דער סעקרעטאר גענעראל איז אזוי + באטראכט געווארן

+
+
+ Greek + +

And finally Greek: Ο φλοιός κυμαίνεται μεταξύ 5 + και 70 km σε βάθος. Τα λεπτά τμήματα του φλοιού είναι κάτω από τους + ωκεανούς (ωκεάνιος φλοιός) και αποτελούνται από πυκνά πετρώματα + μαγνησίου, σιδήρου και πυριτίου. Τα παχύτερα τμήματα του φλοιού + είναι τα ηπειρωτικά τα οποία είναι λιγότερο πυκνά από τα ωκεάνια και + αποτελούνται από πετρώματα πλούσια σε νάτριο, αλουμίνιο και + πυρίτιο.

+
+
+ If in doubt + +

You can add complicated characters as images, for example, + an apple: .

+
+
+
+ Text markup + +

This is a regular paragraph. Words can appear in italics, – + these are examples of this – and some characters are + superscript or subscript. A high pitched voice called and + a deep resonating bass answered. What is the + + Math font good for? Be careful and use + EOAbold only sparingly.

+ + +

To read single Arabic words, as in + Grammaticall Paradigms, we must know the sound of the letters + […] ‘b t θ 3 כ ח L m n w h y.\EOAup{change of + \EOAhebrew{יה} into \EOAhebrew{ח}} place of the accent [\dots] + never in ultima, therefore in penultima in all disyllables as + onsur, never higher than the antepenult, and there always in + polysyllables as nasara, nasarta unlwaaw [y]e penult be made + long by quiescent by אוי as tansoranias tansoriיna + tansoraיnaAdd., f.1r.

+ +

Another paragraph showcasing some additional text marking features. + + Text can be struck through. And we can + space out some words. But small caps I know pretty well. Last, but + not least, here is the way to enter web addresses: http://www.edition-open-access.de +

+ + +

A short check on how hyperref works: + + + + + +

+
+
+ Other types of text blocks + +

If you want to include a longer quote in a text, there is of course a + command for that. And inside that we also introduce you to how you can + insert footnotes.Piaget 1985.

+ + + Do you like the lorem ipsum text? I don't.Kaulbach 1960, 320-322. Klar, man kann damit Seite + um Seite füllen, aber wenn die Leserin entsprechend geneigt + ist, will sie gelesenen Text auch lesen und verstehen + können. Und da hörts dabei einfach auf! + + +
+ Three constructs for lists + +

This is a numbered list containing three items + + This is the first entry. + This is the second entry. + This is the third entry + +

+ +

In contrast to that, here is a list that is not numbered, but also contains the same + items!

+

+ + This is the first entry. + This is the second entry. + This is the third entry + +

+ +

Thirdly, if you want to give some definitions you can use EOAdescription: + + + + A typesetting system + + This is you! + +

+
+
+
+ Smaller spacings +

Small spacings are also there: U\,S\;A\EOAindexlocation{USofA}.

+
+
+ Some technical matters +

Some letters cannot be typed directly when writing in LaTeX. For + example the tilde: \EOAtilde. We have to use a command for that. Also, + of course, as you have already seen, you cannot write the percent sign + as it is, because this is used for commenting. Use \% instead. And for + a backslash? \\? No. That's a newline. \textbackslash!

+
+
+
+ References and indices + +

Elements can be assigned labels so that we can later refer to them: + sections, images, tables. A lot of things can be cross-referenced.

+ + +

By the way, JoyceJames + Joyce is a darn good author. Read some of his + stuff. It may be useful for indexing some of your key terms in + the text so that we can later link to catalogues that tells + the reader more about the things. Joyce died in ZurichZürich. You cannot + read anything about him in neither Walkowski 2016 or Carvalho + 2012! +

+ + +

Now, here's a funny problem concerning the index. Consider + one Lord Charles Cavendish, Lord Charles (son + of the second duke of Devonshire), (Fig. + + , ) + + + + duty of service + + + + We actually have references in there. Can we do this? This is the + reference outside the index: (Fig. + + , )

+ +

This paragraph illustrates various references (see section ). It includes a reference to the image (see image + ), two entries for the index (Science + Science and + + Confucius + 孔夫子), one reference to a facsimile + (see facsimile on page ) + and one reference to a section. The last one is at the beginning. + And of course: references to the bibliography: Gert 2011 [2002], 128 + + . We can also cite manually with Plat. tim. + + . +

+
+
+ Floating environments Floats + +

Floating Gamba and Andersen + 2008 environments + are bigger constructs like figures and tables. If you want to, + you can leave the placing of them to the typesetting system, + in order to avoid bigger stretches Hsu 1993 of white space.

+ +
+ Tables +

The following lines of code produce a table consisting of 4 columns + and 3 rows. Is the table right so?

+ + + + This is a table + + Heading 1 + Heading 2 + Heading 3 + Heading 4 + + + Here + you + may + find + + + some + data + spread + over + + + the + table + in + cells + +
+
+
+ Figures +

Yet another section. We have an image here. The command takes five + parameters: filename, caption, label, width and position

+ +
+ + An image with a caption. It resembles a bird looking at you. Doesn't it? And do you think it looks angry? +
+
+ + Ostafrikanische Arbeiter an einem der großen Knochengräben, die während der paläontologischen Expedition des Berliner Naturkundemuseums in die damalige Kolonie Deutsch-Ostafrika zwischen 1909 und 1913 angelegt wurden. (Koloriertes Glasdiapositiv, Museum für Naturkunde Berlin, Historische Bild- u. Schriftgutsammlungen, Bestand: Pal. Mus., Signatur: B V/177) +
+ + + +

The fat-free version of this is EOAfigurenonumber which does without a + caption and a label. We are using the same source image here, but by + specifying the size of the image, it will scale down accordingly.

+ + +
+ +
+ + +

The last image command – EOAlsfigure – takes three parameters the figure covers the whole page

+
+ + That's a caption for the lovely landscape image +
+
+
+
+ Mathematics + + + + + +

Let's have some mathematics examples here. Latex is very good in + typesetting formulæ, so let's go for it! We already had this example, + but here is another inline equation $\sqrt{9} = 3$. Well, our + root symbol expands the line spacing a bit, so maybe we should not use + it inline. Here is another equation: $2^2 = 4$.

+ +

Instead, an equation outside of a paragraph.

+

+ $6 + 6 = 12$ +

+ +

+ $g_{\mu \nu} = 7$ +

+ + +

And if it does not need to be numbered. They also don't take labels (why?).

+

+ $6 + 6 = 12$ +

+ + + +

+ $6 + 6 = 12\\ + 12 + 12 = 24\\ + 24 + 24 = 48$ +

+ + + + + +

An array of equations. All of them numbered individually.

+

+ $1 + 1 = 2\\ + 2 + 2 = 4\\ + 4 + 4 = 8$ +

+ + + +

Again, an array of equations without numbers. They don't get labels, either.

+

+ $1 + 1 = 2\\ + 2 + 2 = 4\\ + 4 + 4 = 8$ +

+ + + +

Let's insert an empty page here.

+ +
+
+ Transcription and translation + + + +
+
+ Bibliography + + +
+ + + + + + + +
+ +
+
diff --git a/data/images/0002.jpg b/data/images/0002.jpg new file mode 100644 index 0000000..eb5fc71 Binary files /dev/null and b/data/images/0002.jpg differ diff --git a/data/images/002.jpg b/data/images/002.jpg new file mode 100755 index 0000000..80cc5c3 Binary files /dev/null and b/data/images/002.jpg differ diff --git a/data/images/1.jpg b/data/images/1.jpg new file mode 100755 index 0000000..b13827a Binary files /dev/null and b/data/images/1.jpg differ diff --git a/data/images/1_rot.jpg b/data/images/1_rot.jpg new file mode 100644 index 0000000..11978a1 Binary files /dev/null and b/data/images/1_rot.jpg differ diff --git a/data/inline/A.jpg b/data/inline/A.jpg new file mode 100644 index 0000000..b70dd66 Binary files /dev/null and b/data/inline/A.jpg differ diff --git a/data/tei2django.xsl b/data/tei2django.xsl new file mode 100644 index 0000000..f3f97dc --- /dev/null +++ b/data/tei2django.xsl @@ -0,0 +1,274 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Footnotes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + quoted + + + + + + + + + + + + popover + + + citation + + + popover + + + true + + + + + + bottom + + + + + + + + + + + + + + + + + + + + + + + + + + + + + footnote + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/mkimage.py b/mkimage.py new file mode 100644 index 0000000..1446397 --- /dev/null +++ b/mkimage.py @@ -0,0 +1,231 @@ +#!/usr/bin/python3 +# -*- coding: utf-8; mode: python -*- +__version__ = "1.0" +__date__ = "20170323" +__author__ = "kthoden@mpiwg-berlin.mpg.de" + +import os +import sys +import logging +import configparser +import textwrap +import argparse +from PIL import Image, ImageFont, ImageDraw + +logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s') + 
+DIMENSIONS = (2000, 2844) # ratio of 0.703205791106515 +BACKGROUND = 0, 0, 0 + +METADATA_DICT = {'eoa_series': 'Studies', 'eoa_number': '125', + 'eoa_authors': ['Klaus Thoden'], 'eoa_title': + 'Der ewige Testband', 'eoa_subtitle': + 'Experimentell'} + +SERIES_COLOURS = {"sources" : (40, 96, 49), + "studies" : (13, 40, 72), + "proceedings" : (173, 54, 50), + "textbooks" : (210, 182, 35)} + +def get_cover_image(image_path): + """Choose a random landscape image from publications in this volume""" + + import random + + candidates = os.listdir(image_path) + + for image in candidates: + tmp_image = Image.open(image_path + "/" + str(image)) + ratio = calculate_ratio(tmp_image) + if ratio < 1: + candidates.remove(image) + + chosen_image = random.choice(candidates) + + return chosen_image +# def get_cover_image ends here + +def calculate_ratio(image_object): + """Determine the aspect ratio of an image""" + + width, height = image_object.size + + ratio = float(width)/float(height) + + return ratio +# def calculate_ratio ends here + +def resize_image(image_object, max_size, dimension): + """Resize an image, preserve ratio. + + Takes three arguments, an image object, the maximal size and the + dimension (width or height). + + https://stackoverflow.com/questions/273946/how-do-i-resize-an-image-using-pil-and-maintain-its-aspect-ratio + """ + + width, height = image_object.size + + if dimension == "height": + height_percent = (max_size/float(height)) + wsize = int((float(width)*float(height_percent))) + resized_image = image_object.resize((wsize, max_size), Image.ANTIALIAS) + elif dimension == "width": + width_percent = (max_size/float(width)) + hsize = int((float(height)*float(width_percent))) + resized_image = image_object.resize((max_size, hsize), Image.ANTIALIAS) + else: + print("You must either specify height or width as dimension. 
Exiting.") + sys.exit(0) + + return resized_image +# def resize_image ends here + +def format_authors(authors_list): + """Format the list of authors + + Input is the start and end point of the authors in a list. Return + both a formatted string and the pure list of authors. + """ + + if len(authors_list) == 0: + authors_as_string = "" + if len(authors_list) == 1: + authors_as_string = """%s""" % (authors_list[0]) + elif len(authors_list) == 2: + authors_as_string = """%s and %s""" % (authors_list[0], authors_list[1]) + elif len(authors_list) > 2: + authors_as_string = """%s""" % authors_list[0] + for author in range(1, len(authors_list) - 1): + authors_as_string += ", " + authors_list[author] + authors_as_string += " and %s" % (authors_list[-1]) + + return authors_as_string +# def format_authors ends here + +def add_watermark(image, watermarkstring): + """Add a string of text across the cover. Return a rotated image object""" + + # https://codenhagen.wordpress.com/2015/12/04/putting-rotated-text-on-images-with-pillow-python/ + + base_image = Image.open(image) + + tmp_img = Image.new("RGBA", (DIMENSIONS[1], DIMENSIONS[0]), (0,0,0,0)) + font_colour = (255,0,0) + big_red_font = ImageFont.truetype("Helvetica", 200) + text_canvas = ImageDraw.Draw(tmp_img) + + text_canvas.text((0, 0), watermarkstring, font=big_red_font, fill=font_colour) + + slanted_image = tmp_img.rotate(60, expand=True) + + # add third parameter as transparent mask + # https://stackoverflow.com/questions/5324647/how-to-merge-a-transparent-png-image-with-another-image-using-pil + base_image.paste(slanted_image, (200, 100), slanted_image) + + base_image.save(image) + print("Added a watermark to", image) + # return slanted_image +# def add_watermark ends here + +def centered(textstring, font_spec): + """Return coordinates for a centered string.""" + + tmp_draw = ImageDraw.Draw(Image.new("RGB", DIMENSIONS, BACKGROUND)) + + string_width, string_height = tmp_draw.textsize(textstring, font=font_spec) + + 
coordinate = DIMENSIONS[0] / 2 - string_width / 2 + + return coordinate +# def centered ends here + +def create_cover(metadata_dict, image_directory, cover_filename): + """Create a cover using PIL""" + + img = Image.new("RGB", DIMENSIONS, BACKGROUND) + + upper_part = Image.new("RGB", (DIMENSIONS[0], int(DIMENSIONS[1]/3)), SERIES_COLOURS[metadata_dict['eoa_series'].lower()]) + img.paste(upper_part, (0, 0)) + + title_text = metadata_dict['eoa_title'] + subtitle_text = metadata_dict['eoa_subtitle'] + authors_text = format_authors(metadata_dict['eoa_authors']) + series_number_text = "{0} {1}".format(metadata_dict['eoa_series'], metadata_dict['eoa_number']) + + if metadata_dict['eoa_series'].lower() == "sources": + press_text = "Edition Open Sources" + else: + press_text = "Max Planck Research Library for the History and Development of Knowledge" + + if metadata_dict['eoa_series'].lower() == "textbooks": + fill_colour_top = (0, 0, 0) + else: + fill_colour_top = (255, 255, 255) + + big_bold_font = ImageFont.truetype(font="Times New Roman Bold", size=120) + medium_font = ImageFont.truetype(font="Times New Roman", size=100) + small_font = ImageFont.truetype(font="Times New Roman", size=80) + + text_draw = ImageDraw.Draw(img) + + # these will eventually also become candidates for multilines + text_draw.text((centered(title_text, big_bold_font), 200), title_text, font=big_bold_font, fill=fill_colour_top) + text_draw.text((centered(subtitle_text, medium_font), 350), subtitle_text, font=medium_font, fill=fill_colour_top) + text_draw.text((centered(authors_text, small_font), int(DIMENSIONS[1]/3)-200), authors_text, font=small_font, fill=fill_colour_top) + + press_text_lines = textwrap.wrap(press_text, width=40) + press_text_lines.append(series_number_text) + + press_text_joined = "\n".join(press_text_lines) + ptcenter = centered(press_text_joined, small_font) + + text_draw.multiline_text((ptcenter,DIMENSIONS[1]-400), press_text_joined, font=small_font, align="center") + + 
image_on_cover = Image.open(os.path.join(image_directory, get_cover_image(image_directory))) + + MAXIMUM_HEIGHT = 1200 + + resized_image = resize_image(image_on_cover, MAXIMUM_HEIGHT, "height") + + coord = DIMENSIONS[0]/2 - resized_image.width/2 + img.paste(resized_image, (int(coord), 1200)) + + img.save(cover_filename) + print("Wrote", cover_filename) + + watermark = add_watermark(cover_filename, "Automatically generated cover.\nDo not use in production!") +# def create_cover ends here + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("image_dir", help="Path to directory with potential images.") + parser.add_argument("-c", "--config", help="File that contains the publication data.", default="publication.cfg") + parser.add_argument("-o", "--output", help="Name of output file.", default="Cover.jpg") + args = parser.parse_args() + + IMAGES = args.image_dir + + if os.path.exists("publication.cfg"): + logging.debug("Using %s as publication config" % (args.config)) + config = configparser.ConfigParser() + + config.read(args.config) + + list_of_authors = list(set(config['Authors'].values())) + for author in list_of_authors: + if len(author) == 0: + list_of_authors.remove(author) + + METADATA_DICT.update({'eoa_series' : config['Technical']['Serie']}) + METADATA_DICT.update({'eoa_number' : config['Technical']['Number']}) + METADATA_DICT.update({'eoa_title' : config['Technical']['Title']}) + METADATA_DICT.update({'eoa_subtitle' : config['Technical']['Subtitle']}) + METADATA_DICT.update({'eoa_authors' : list_of_authors}) + else: + logging.debug("Using the built-in metadata as publication config") + + OUTFILE = args.output + + create_cover(METADATA_DICT, IMAGES, OUTFILE) +# finis diff --git a/prepare_tei.md b/prepare_tei.md new file mode 100644 index 0000000..2e28e0f --- /dev/null +++ b/prepare_tei.md @@ -0,0 +1,19 @@ +# Document preparation +Conversion of docx documents to TEI XML + +Used metypeset with parameters `--prettytei 
--puretei`. This tool, however, removes the div structure that is important for the sectioning of the work. + +Therefore, another attempt with oxgarage which retains the div structure. + +# Handling of citations +We use bibtex to store bibliographic data. When producing PDF, we can use the LaTeX tools to format citations and references. + +For the HTML view, a similar workflow was used (tralics etc), but the output format of biber has been changed, we have not yet adapted to it. + +One can use pandoc in conjunction with pandoc-citeproc to do the formatting. + +The prepare_tei.py script produces a markdown file that only contains the references and being run with + + pandoc -o ldaston.html -t html --filter=pandoc-citeproc --bibliography=03_daston.bib 03_daston-citations.md + +Will produce an easily parseable html file that we can use to extract the formatted bibliography and references from. diff --git a/prepare_tei.py b/prepare_tei.py new file mode 100644 index 0000000..bff26ab --- /dev/null +++ b/prepare_tei.py @@ -0,0 +1,450 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8; mode: python -*- + +__version__ = "1.0" +__date__ = "20180109" +__author__ = "kthoden@mpiwg-berlin.mpg.de" + +import sys +import os +import re +import json +import logging +import shlex +import pickle +import subprocess +from lxml import etree +from datetime import datetime +import bibtexparser +import argparse +import traceback + +logging.basicConfig(level=logging.INFO, format=' %(asctime)s - %(levelname)s - %(message)s') + +# do things like in latex2eoa: search and replace things by regex +# also, delete elements and attributes inserted by metypeset +# and, rename elements according to our schema + +# treat +# assignment of identifiers + +ns_tei = "http://www.tei-c.org/ns/1.0" +NS_MAP = {"t" : ns_tei} + +def parse_bibtex(bibfile): + """Parse the bibtex file, return a dict""" + + all_references = {} + + with open(bibfile) as btf: + btb = bibtexparser.load(btf) + tmp_dict = btb.entries_dict + + 
all_references.update(tmp_dict) + + return tmp_dict + # return all_references +# def parse_bibtex ends here + +def unescape(text): + """Remove HTML or XML character references and entities from a text + string. Return a Unicode string. + + With thanks to http://effbot.org/zone/re-sub.htm#unescape-html. + Modified to work with Python3. + """ + import re, html.entities + + def fixup(m): + text = m.group(0) + if text[:2] == "&#": + # character reference + try: + if text[:3] == "&#x": + return chr(int(text[3:-1], 16)) + else: + return chr(int(text[2:-1])) + except ValueError: + pass + else: + # named entity + try: + text = chr(html.entities.name2codepoint[text[1:-1]]) + except KeyError: + pass + return text # leave as is + return re.sub(r"&#?\w+;", fixup, text) +# def unescape ends here + +def convert_citations(string): + """Find citation shorthand using regex. + + Return a tuple of the modified string and a list of found citations + + In a second step, parse the result and return citekey and pagerange + (if present)). + + + + + + """ + inline_citation_pattern = re.compile(r"(§|§|§)(§|§|§)(?P.+?)(\!(?P.*?))?(§|§|§)(§|§|§)") + inline_citations = re.findall(inline_citation_pattern, string) + logging.info("Found %s inline citations." % len(inline_citations)) + string = re.sub(inline_citation_pattern, r"", string) + + citation_pattern = re.compile(r"(§|§|§)(?P.+?)(\!(?P.*?))?(§|§|§)") + citations = re.findall(citation_pattern, string) + logging.info("Found %s citations." 
% len(citations)) + string = re.sub(citation_pattern, r"", string) + + return (string, citations) +# def convert_citations ends here + +def parse_cited_range(list_of_xml_elements): + """citedRange: split up parameters or remove element if attributes are empty""" + + unsplittable_pageref = [] + + for reference in list_of_xml_elements: + cited_range = reference.find("t:citedRange", namespaces=NS_MAP) + from_value = (cited_range.get("from")) + + split_values = re.findall(r"[\w']+", from_value) + if len(from_value) == 0: + cited_range.tag = "tagtobestripped" + cited_range.attrib.pop("from") + elif len(split_values) == 1: + cited_range.set("from", split_values[0]) + elif len(split_values) == 2: + cited_range.set("from", split_values[0]) + cited_range.set("to", split_values[1]) + elif len(split_values) == 3: + cited_range.set("from", split_values[0]) + cited_range.set("to", split_values[2]) + else: + logging.info("Splitting the page range produced unexpected result. Tried to split %s" % from_value) + unsplittable_pageref.append(from_value) + + return unsplittable_pageref +# def parse_cited_range ends here + +def validate_citations(used_citekeys, bibdata): + """Check if all found citekeys are in the database + + Return a list of unavailable citekeys.""" + + available_citekeys = bibdata.keys() + + no_citekey = [] + + for citekey in used_citekeys: + if citekey not in available_citekeys: + no_citekey.append(citekey) + logging.info("%s is not in the bibliographic database" % citekey) + + return no_citekey +# def validate_citations ends here + +def convert_figures(string): + """Find figures shorthands""" + + # negative lookbehind assertion. 
Real + characters must be escaped by \ + graphic_pattern = re.compile(r"(?]]>", string) + + return string +# def convert_figures ends here + +def make_figure_elements(list_of_figures, figure_directory): + """Construct the figure element.""" + + bad_images = [] + available_images = [] + available_images_long = os.listdir(figure_directory) + + for img in available_images_long: + available_images.append(os.path.splitext(img)[0]) + + for graphic in list_of_figures: + parent_tag = graphic.getparent() + parent_tag.tag = "figure" + + original_string = graphic.text + graphic.clear() + + parts = original_string.split("!") + + if len(parts) in range(2,4): + if parts[0] in available_images or parts[0] in available_images_long: + selected_image = parts[0] + logging.info("Found %s in the text. Selected %s as corresponding image." % (parts[0], selected_image)) + graphic.set("scale", "50") + graphic.set("url", "images/" + selected_image) + else: + bad_images.append(original_string) + + caption = "" + parts[1] + "" + head_element = etree.fromstring(caption) + parent_tag.insert(1, head_element) + + if len(parts) == 3: + logging.info("This figure contains hyperimage directions") + yenda_command = etree.Comment("Hyperimage direction: %s" % parts[2]) + parent_tag.append(yenda_command) + + else: + logging.info("The figure string could not be split by '!': %s" % etree.tostring(graphic)) + + return bad_images +# def make_figure_elements ends here + +def cleanup_xml(xml_tree): + """Perform some cleaning on XML""" + + # also, delete elements and attributes inserted by metypeset + # and, rename elements according to our schema + + metypeset_attrib = xml_tree.findall("//t:*[@meTypesetSize]", namespaces=NS_MAP) + color_attrib = xml_tree.xpath("//t:hi[contains(@rend, 'color') or contains(@rend, 'background')]", namespaces=NS_MAP) + + logging.info("Found %s metypesets." % len(metypeset_attrib)) + logging.info("Found %s colour attributes." 
% len(color_attrib)) + + for attribute in metypeset_attrib: + logging.info("number of attributes: %s" % len(attribute.attrib)) + attribute.attrib.pop("meTypesetSize") + + for attribute in color_attrib: + attribute.attrib.pop("rend") + + hi_without_attrib2 = xml_tree.findall("//t:hi", namespaces=NS_MAP) + + for attribute in hi_without_attrib2: + if len(attribute.attrib) == 0: + xml_parent = attribute.getparent() + attribute.tag = "tagtobestripped" + + footnotes = xml_tree.xpath("//t:note[@place='foot']", namespaces=NS_MAP) + for footnote in footnotes: + footnote.set("place", "bottom") + + etree.strip_tags(xml_tree, "tagtobestripped") + + return xml_tree +# def cleanup_xml ends here + +def fix_document_structure(xml_tree): + """Insert div types""" + + # Unsure here, but maybe have a rule that one file is one chapter, + # so the highest level would be sections + + # chapter_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP) + # section_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP) + # subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP) + # subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP) + + # for chapter in chapter_divs: + # chapter.set("type", "chapter") + # for section in section_divs: + # section.set("type", "section") + # for subsection in subsection_divs: + # subsection.set("type", "subsection") + # for subsubsection in subsubsection_divs: + # subsubsection.set("type", "subsubsection") + + section_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP) + subsection_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP) + subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP) + + for section in section_divs: + section.set("type", "section") + for subsection in subsection_divs: + subsection.set("type", "subsection") + for subsubsection in subsubsection_divs: + subsubsection.set("type", "subsubsection") + + +# def 
fix_document_structure ends here + +def fix_tei_header(xml_tree, bibfile_string): + """Populate TEI header with mandatory data""" + + title_statement = xml_tree.xpath("//t:titleStmt", namespaces=NS_MAP)[0] + series = etree.SubElement(title_statement, "title", level="s", n="20").text = "Studies" + main_title = etree.SubElement(title_statement, "title", type="main").text = "FotoObjekte" + # subtitle = etree.SubElement(title_statement, "title", level="sub").text = "Artikelsammlung" + + publication_statement = xml_tree.xpath("//t:publicationStmt", namespaces=NS_MAP)[0] + pub_date = etree.SubElement(publication_statement, "date", when=datetime.now().strftime("%Y-%m-%d")) + availability = etree.SubElement(publication_statement, "availability") + licence = etree.SubElement(availability, "licence", target="https://creativecommons.org/licenses/by-nc-sa/3.0/de/deed.en") + licence.text = "by-nc-sa" + # licence_text = etree.SubElement(licence, "p").text = """Distributed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Germany License.""" + + source_desc = xml_tree.xpath("//t:sourceDesc", namespaces=NS_MAP)[0] + bibfile = etree.SubElement(source_desc, "listBibl", source=bibfile_string, type="monograph") + + profile_desc = etree.SubElement(xml_tree, "profileDesc") + langusage = etree.SubElement(profile_desc, "langUsage") + language = etree.SubElement(langusage, "language", ident="en").text = "English" + return xml_tree +# def fix_tei_header ends here + +def plural(num, noun): + """Return singular or plural form of noun, depending on num. + + Works only when a noun's plural is formed with 's'. 
""" + + if num == 1: + return noun + else: + return noun + "s" +# def plural ends here + +def evaluate_report(report): + """Print report of conversion.""" + + print("="*60) + print(' '*4, "Conversion report") + print("-"*60) + if len(report["bad_figures"]) > 0: + print("{} {} could not be linked to a file in the image directory:".format(len(report["bad_figures"]), plural(len(report["bad_figures"]), "figure"))) + for item in report["bad_figures"]: + print(' '*4, item) + else: + print("All figures were linked.") + if len(report["not_cited"]) > 0: + print("{} {} could not be found in the bibliography database:".format(len(report["not_cited"]), plural(len(report["not_cited"]), "citation"))) + for item in report["not_cited"]: + print(' '*4, item) + else: + print("All citekeys were found in the bibliography database.") + if len(report["bad_pageref"]) > 0: + print("{} page {} could not be parsed into start and end value:".format(len(report["bad_pageref"]), plural(len(report["bad_pageref"]), "reference"))) + for item in report["bad_pageref"]: + print(' '*4, item) + else: + print("All page references could be parsed into discrete values.") + print("="*60) +# def evaluate_report ends here + +def main(): + """The main bit""" + + parser = argparse.ArgumentParser() + parser.add_argument("teifile", help="Output from oxgarage/metypeset, an TEI XML file.") + parser.add_argument("bibfile", help="The bibliography database of the publication.") + parser.add_argument("figdir", help="The directory that contains the figures belonging to the publication.") + args = parser.parse_args() + + with open(args.teifile, 'r') as xmlfile: + xml_tree = etree.parse(xmlfile) + + report = {} + + ################ + # bibliography # + ################ + # bibtexparser + bibdata = parse_bibtex(args.bibfile) + + xml_cleaned = cleanup_xml(xml_tree) + xml_cleaned.write("cleaned.xml", pretty_print=True, xml_declaration=True, encoding="utf-8") + logging.info("Wrote cleanup.xml") + + # first some modifications 
on a string object + xml_string = etree.tostring(xml_cleaned).decode('utf-8') + mod_string, cited = convert_citations(xml_string) + + used_citekeys = [unescape(c[1]) for c in cited] + not_cited = validate_citations(used_citekeys, bibdata) + + report["not_cited"] = not_cited + + mod_string2 = convert_figures(mod_string) + + debug_output = args.teifile.replace(".xml", "-modified.xml") + with open(debug_output, "w") as debugfile: + debugfile.write(mod_string2) + logging.info("Wrote %s." % debug_output) + + # check for wellformedness, read again as xml + try: + xml_tree2 = etree.fromstring(mod_string2) + except etree.XMLSyntaxError: + print("\nXML syntax error when trying to parse modified tree. Dumped it to %s." % debug_output) + print("-"*60) + traceback.print_exc(file=sys.stdout) + print("-"*60) + exit() + + all_figures = xml_tree2.xpath("//t:graphic", namespaces=NS_MAP) + bad_figures = make_figure_elements(all_figures, args.figdir) + + report["bad_figures"] = bad_figures + + all_references = xml_tree2.xpath("//t:bibl", namespaces=NS_MAP) + bad_pageref = parse_cited_range(all_references) + + report["bad_pageref"] = bad_pageref + + etree.strip_tags(xml_tree2, "tagtobestripped") + + tei_header = xml_tree2.xpath("//t:teiHeader", namespaces=NS_MAP) + fix_tei_header(tei_header[0], str(args.bibfile)) + + dictChapters = {} + dictEquations = {} + dictLists = {} + dictTheorems = {} + dictFigures = {} + dictSections = {} + dictFootnotes = {} + dictTables = {} + dictPagelabels = {} + + data_to_pickle = {'citekeys' : used_citekeys, + 'chapterdict' : dictChapters, + 'eqdict' : dictEquations, + 'listdict' : dictLists, + 'theoremdict' : dictTheorems, + 'figdict' : dictFigures, + 'secdict' : dictSections, + 'fndict' : dictFootnotes, + 'tabdict' : dictTables, + 'pagelabeldict' : dictPagelabels} + + if not os.path.exists("tmp_files/"): + os.makedirs(os.path.expanduser("tmp_files/")) + + with open('tmp_files/data.pickle', 'wb') as f: + # Pickle the 'data' dictionary using the highest 
protocol available. + pickle.dump(data_to_pickle, f, pickle.HIGHEST_PROTOCOL) + + fix_document_structure(xml_tree2) + # output + output = args.teifile.replace(".xml", "-out.xml") + tree = etree.ElementTree(xml_tree2) + tree.write(output, pretty_print=True, xml_declaration=True,encoding="utf-8") + logging.info("Wrote %s." % output) + + evaluate_report(report) +# def main ends here + +if __name__ == '__main__': + main() +# finis diff --git a/tei2django.py b/tei2django.py new file mode 100644 index 0000000..c0c9215 --- /dev/null +++ b/tei2django.py @@ -0,0 +1,217 @@ +#!/usr/bin/python3 +# -*- coding: utf-8; mode: python -*- +__version__ = "1.0" +__date__ = "20170315" +__author__ = "kthoden@mpiwg-berlin.mpg.de" +__doc__ = """A converter from TEI to Django.""" + +import sys +import os +import configparser +from datetime import datetime +from lxml import etree +import mkimage + +OUTPUT_DIR = "./CONVERT" +XSL_FILE = os.path.dirname(sys.argv[0]) + "/data/tei2django.xsl" +FIGURE_DIR = "./data/images" + +def process_formulas(xml_tree): + """Process formulas""" + + """ + Format of filenames: EOAineq_12_62.png chapter number + """ + + pass +# def process formulas ends here + +def get_publication_info(xml_tree): + """Query the TEI document for metadata fields. + + Return a dictionary""" + + info_dict = {} + + ns_tei = "http://www.tei-c.org/ns/1.0" + ns_cc = "http://web.resource.org/cc/" + ns_rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" + NS_MAP = {"t" : ns_tei, "c" : ns_cc, "r" : ns_rdf} + + def get_field(xml_tree, query_path, mandatory=False, findall=False): + """Query XML for metadata fields. 
+ + Default behaviour is if it fails, move on, if mandatory is set + to True, exit the program + """ + + if findall is True: + find_several = xml_tree.findall(query_path, namespaces=NS_MAP) + if len(find_several) == 1: + return_string = [find_several[0].text] + else: + return_string = [x.text for x in find_several] + else: + tmp_field = xml_tree.xpath(query_path, namespaces=NS_MAP) + if len(tmp_field) > 0: + return_string = tmp_field[0] + else: + if mandatory is True: + sys.exit("Field stored in %s is mandatory. Exiting." % query_path) + else: + return_string = "" + + return return_string + # def get_field ends here + + # Mandatory values (according to database schema) + info_dict['eoa_publicationdate'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:publicationStmt/t:date/@when", mandatory=True) + info_dict['eoa_language'] = get_field(xml_tree, "//t:teiHeader/t:profileDesc/t:langUsage/t:language/@ident", mandatory=True) + info_dict['eoa_license'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:publicationStmt/t:availability/t:licence/text()", mandatory=True) + info_dict['eoa_number'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:title[@level='s']/@n", mandatory=True) + info_dict['eoa_series'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:title[@level='s']/text()", mandatory=True) + info_dict['eoa_title'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:title[@type='main']/text()", mandatory=True) + + # Optional (according to database schema) + info_dict['eoa_subtitle'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:title[@type='sub']/text()") + info_dict['eoa_isbn'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:publicationStmt/t:idno[@type='ISBN']/text()") + info_dict['eoa_price'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:extent/t:measure[@unit='EUR']/@quantity") + info_dict['eoa_shoplink_url'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:publicationStmt/t:distributor/@xml:base") + 
info_dict['eoa_shoplink_id'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:publicationStmt/t:distributor/@xml:id") + info_dict['eoa_shoplink_text'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:publicationStmt/t:distributor/text()") + info_dict['eoa_brief_desc'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:sourceDesc/t:ab[@type='BriefDescription']/text()") + info_dict['eoa_detail_desc'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:sourceDesc/t:ab[@type='DetailedDescription']/text()") + info_dict['eoa_additional_info'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:sourceDesc/t:ab[@type='additionalinformation']/text()") + info_dict['eoa_dedication'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:sourceDesc/t:ab[@type='dedication']/text()") + + info_dict['eoa_submitters'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='submitter']", findall=True) + info_dict['eoa_publicationmanagers'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='publicationmanager']", findall=True) + info_dict['eoa_publicationassistants'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='publicationassistant']", findall=True) + info_dict['eoa_editorialcoordinators'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='editorialcoordinator']", findall=True) + info_dict['eoa_copyeditors'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='copyeditor']", findall=True) + info_dict['eoa_translators'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='translator']", findall=True) + info_dict['eoa_keywords'] = get_field(xml_tree, "//t:teiHeader/t:profileDesc/t:textClass/t:keywords/t:list/t:item", findall=True) + info_dict['eoa_authors'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:author", findall=True) + + return info_dict +# def get_publication_info ends here + +def populate_config_file(info_dict, config): + 
"""Parse the XML header and write it in config file.""" + + # set up three main bits + config['Technical'] = {} + technical_config = config['Technical'] + config['General'] = {} + general_config = config['General'] + config['Authors'] = {} + authors_config = config['Authors'] + + date_object = datetime.strptime(info_dict['eoa_publicationdate'], "%Y-%m-%d") + + # fill in the fields + technical_config['Serie'] = info_dict['eoa_series'] #ok + technical_config['Number'] = info_dict['eoa_number'] #ok + technical_config['Title'] = info_dict['eoa_title'] #ok + technical_config['Subtitle'] = info_dict['eoa_subtitle'] #ok + technical_config['PublicationDate'] = info_dict['eoa_publicationdate'] #ok + technical_config['PublicationYear'] = datetime.strftime(date_object, "%Y") + technical_config['ISBN'] = info_dict['eoa_isbn'] #ok + technical_config['Price'] = info_dict['eoa_price'] #ok + technical_config['Shoplink'] = """{2}""".format(info_dict['eoa_shoplink_url'], info_dict['eoa_shoplink_id'].replace("id_", ""), info_dict['eoa_shoplink_text']) #ok + technical_config['Language'] = info_dict['eoa_language'] #ok + technical_config['License'] = info_dict['eoa_license'] #ok + + general_config['BriefDescription'] = info_dict['eoa_brief_desc'] #ok + general_config['Submitter'] = ", ".join(info_dict['eoa_submitters']) #ok + general_config['PublicationManagment'] = ", ".join(info_dict['eoa_publicationmanagers']) + general_config['PublicationAssistants'] = ", ".join(info_dict['eoa_publicationassistants']) + + if len(info_dict['eoa_keywords']) > 8: + sys.exit("Too many Keywords. Up to 8 are allowed. 
Exiting.") + else: + for keyword in info_dict['eoa_keywords']: + keyword_label = "Keyword" + str(info_dict['eoa_keywords'].index(keyword) + 1) + general_config[keyword_label] = keyword + + general_config['DetailedDescription'] = info_dict['eoa_detail_desc'] #ok + general_config['AdditionalInformation'] = info_dict['eoa_additional_info'] #ok + general_config['EditorialCoordination'] = ", ".join(info_dict['eoa_editorialcoordinators']) + general_config['Copyediting'] = ", ".join(info_dict['eoa_copyeditors']) + general_config['Dedication'] = info_dict['eoa_dedication'] #ok + general_config['Translator'] = ", ".join(info_dict['eoa_translators']) + + if len(info_dict['eoa_authors']) > 5: + sys.exit("Too many authors. Up to 5 are allowed. Exiting.") + else: + for entry in range(0, 5): + author_label = "Author" + str(entry + 1) + try: + authors_config[author_label] = info_dict['eoa_authors'][entry] + except IndexError: + authors_config[author_label] = "" + + authors_config['Zusatz'] = "" + + return config +# def populate_config_file ends here + +def write_publication_config(publication_dict): + """Main function""" + + config = configparser.ConfigParser(delimiters=(':')) + # https://stackoverflow.com/questions/1611799/preserve-case-in-configparser + config.optionxform=str + publication_config = populate_config_file(publication_dict, config) + + output_filename = OUTPUT_DIR + "/publication.cfg" + with open(output_filename, 'w') as configfile: + publication_config.write(configfile) + print("Wrote", output_filename) +# def write_publication_config ends here + +def write_django_xml(return_string): + """Write the output of XSL transformation to file""" + + output_filename = OUTPUT_DIR + "/Django.xml" + + with open(output_filename, 'w') as djangofile: + djangofile.write(str(return_string)) + + print("Wrote", output_filename) +# def write_django_xml ends here + +def xsl_for_body(xml_file, xsl_file): + """Perform XSL transformation of body. 
+ + Return XSLT result tree.""" + + xml_tree = etree.parse(xml_file) + parsed_xsl_file = etree.parse(xsl_file) + transformer = etree.XSLT(parsed_xsl_file) + result_tree = transformer(xml_tree) + + return result_tree +# def xsl_for_body ends here + +if __name__ == '__main__': + if len(sys.argv) == 1: + print("You must specify an input file!") + sys.exit() + elif len(sys.argv) > 2: + print("You can work with only one publication at a time!") + sys.exit() + + tei_document = sys.argv[-1] + + xml_tree = etree.parse(tei_document) + publication_dict = get_publication_info(xml_tree) + + if not os.path.exists(OUTPUT_DIR): + os.mkdir(os.path.expanduser(OUTPUT_DIR)) + + write_publication_config(publication_dict) + # mkimage.create_cover(publication_dict, FIGURE_DIR, OUTPUT_DIR + "/Cover.jpg") + body_transformed = xsl_for_body(tei_document, XSL_FILE) + write_django_xml(body_transformed) +# finis diff --git a/transform_xml.py b/transform_xml.py new file mode 100644 index 0000000..2197cd4 --- /dev/null +++ b/transform_xml.py @@ -0,0 +1,501 @@ +#!/usr/bin/python3 +# -*- coding: utf-8; mode: python -*- +__version__ = "1.0" +__date__ = "20180116" +__author__ = "kthoden@mpiwg-berlin.mpg.de" +__doc__ = """A converter from TEI to Django.""" + +import os +import sys +import logging +import json +import subprocess +import pickle +import shlex +from bs4 import BeautifulSoup +from lxml import etree, objectify +from lxml.html import soupparser + +# things to be done +# assign ids top to bottom for the following elements: +# div1 div2 div3 note item table EOAfigure EOAequation formula theorem + +logging.basicConfig(level=logging.INFO, format=' %(asctime)s - %(levelname)s - %(message)s') + +ns_tei = "http://www.tei-c.org/ns/1.0" +NS_MAP = {"t" : ns_tei} +OUTPUT_DIR = "./tmp_files" +CSL_FILE = "/Users/kthoden/EOAKram/dev/eoa-csl/eoa.csl" + +# this is duplicated from libeoaconvert +dictLangFootnotes = {"it" : "Note a piè pagina", "fr" : "notes en bas de page", "de" : "Fußnoten", "en" : 
"Footnotes"} + +# this is duplicated from libeoaconvert +def two_letter_language(language_string): + """Return a two letter code for a language""" + + if language_string in ["english", "en"]: + return "en" + elif language_string in ["german", "deutsch", "de"]: + return "de" + elif language_string in ["french", "fr"]: + return "fr" + elif language_string in ["italian", "it"]: + return "it" +# two_letter_language ends here + +def render_reference(list_of_xml_elements, cited_data): + """Provide an attribute for a formatted version of Reference. + + This will be used for output formats that don't have a bibliographic + formatter themselves + """ + + for reference in list_of_xml_elements: + citekey = reference.xpath("t:ref/@target", namespaces=NS_MAP)[0][1:] + + # here we need to get a formatted version of the entry, like it + # would appear in the typeset version. + # looked at: bibulous + # pandoc-citeproc, maybe + + element = etree.SubElement(reference, "abbr", type="authoryear") + element.text = cited_data[citekey][1] + element = etree.SubElement(reference, "abbr", type="title") + element.text = cited_data[citekey][2] +# def render_reference ends here + +def format_citations(used_citekeys, bibdata): + """Return a formatted entry of the used citations""" + + md_file_header = "---\nlang: en\ntitle: Citations\n...\n\n" + + with open(OUTPUT_DIR + os.path.sep + "used_citations.md", "w") as citation_formatter: + citation_formatter.write(md_file_header) + # citation_formatter.write("# Full parentheses\n") + citation_formatter.write("# citeauthoryear\n") + for entry in used_citekeys: + citation_formatter.write("[@%s]\n" % entry) + citation_formatter.write("\n# citeyear\n") + for entry in used_citekeys: + citation_formatter.write("[-@%s]\n" % entry) + # sentencestyle + citation_formatter.write("\n# yearparen\n") + for entry in used_citekeys: + citation_formatter.write("@%s\n" % entry) + citation_formatter.write("\n# References\n") + + with open(OUTPUT_DIR + os.path.sep + 
"formatted_citations.html", "r") as ding: + cites = BeautifulSoup(ding, "html.parser") + with open(OUTPUT_DIR + os.path.sep + "formatted_citations.html", "r") as ding: + reference_list = soupparser.fromstring(ding, features="html.parser") + + # references = dd.xpath("//div[@class='references']") + # with open("tmp_files/formatted_citations.html", "r") as ding: + + references = reference_list.xpath("//div[@class='references']")[0] + + # full_paren_cites = cites.select("#full-parentheses ~ p > span") + # year_paren_cites = cites.select("#year-parentheses ~ p > span") + + citation_dict = {} + + for entry in used_citekeys: + for entry_2 in bibdata: + if entry_2["id"] == entry: + current_citation = entry + strTitle = entry_2["title"] + + title = strTitle + full_paren = cites.select("#citeauthoryear ~ p > span[data-cites='%s']" % entry)[0].text + year_paren = cites.select("#yearparen ~ p > span[data-cites='%s']" % entry)[0].text + citation_dict[entry] = (full_paren, year_paren, title) + + return citation_dict, references +# def format_citations ends here + +def format_pagerange(pagerange_start, pagerange_end): + """Parse valuse of citedRange attributes. 
Return formatted string""" + + return_string = "" + + if pagerange_start is not None: + return_string += pagerange_start + if pagerange_end is not None: + return_string += "–" + pagerange_end + + return return_string +# def format_pagerange ends here + +def transform_body(xml_tree, cited_data, publang): + """Transform the body of XML document into EOADjango file""" + + ###################### + # Document structure # + ###################### + + # unclean solution + chapter_element = xml_tree[0] + chapter_element.tag = "div1" + chapter_element.set("language", publang) + + eoa_chapters = xml_tree.xpath("//t:div[@type='chapter']", namespaces=NS_MAP) + for chapter in eoa_chapters: + chapter.tag = "div1" + chapter.set("language", publang) + + eoa_sections = xml_tree.xpath("//t:div[@type='section']", namespaces=NS_MAP) + for section in eoa_sections: + section.tag = "div2" + + eoa_subsections = xml_tree.xpath("//t:div[@type='subsection']", namespaces=NS_MAP) + for subsection in eoa_subsections: + subsection.tag = "div3" + + eoa_subsubsections = xml_tree.xpath("//t:div[@type='subsubsection']", namespaces=NS_MAP) + for subsubsection in eoa_subsubsections: + subsubsection.tag = "div4" + + + ############## + # Paragraphs # + ############## + + eoa_paragraphs = xml_tree.xpath("//t:p[not(@rend='footnote text')]", namespaces=NS_MAP) + + for paragraph in eoa_paragraphs: + paragraph.tag = "p" + + if paragraph.get("rend") == "Quote": + paragraph.set("rend", "quoted") + + ############# + # Citations # + ############# + + # we need some data of the references here! 
+ """ + +Intermediate XML: +Monti + + """ + + eoa_citations = xml_tree.xpath("//t:bibl", namespaces=NS_MAP) + + for citation in eoa_citations: + pagerange = "" + cited_range = citation.xpath("t:citedRange", namespaces=NS_MAP) + citeref = citation.xpath("t:ref", namespaces=NS_MAP) + citekey = citeref[0].get("target")[1:] + citeref[0].tag = "tagtobestripped" + + citation.tag = "span" + citation.set("rel", "popover") + citation.set("class", "citation") + citation.set("citekey", citekey) + citation.set("data-toggle", "popover") + citation.set("html", "true") + citation.set("data-placement", "bottom") + + if len(cited_range) > 0: + pagerange_start = cited_range[0].get("from") + pagerange_end = cited_range[0].get("to") + pagerange = ", " + format_pagerange(pagerange_start, pagerange_end) + cited_range[0].tag = "tagtobestripped" + + formatted_citation = cited_data[citekey][0] + pagerange + citation.set("data-title", formatted_citation) + citation.set("data-content", cited_data[citekey][2]) + + citation.text = formatted_citation + + ############# + # Footnotes # + ############# + + eoa_footnotes = xml_tree.xpath("//t:note[@place='bottom']", namespaces=NS_MAP) + + """ + +

One reads + """ + + for footnote in eoa_footnotes: + # re-assign tag here to get rid of namespace + footnote.tag = "note" + footnote.set("place", "Inline") + footnote.set("id-text", footnote.get("n")) + + fn_parent = footnote.getparent() + # we assert here that the parent of a footnote is always a paragraph + assert(fn_parent.tag == "p") + + fn_paragraphs = footnote.xpath("t:p", namespaces=NS_MAP) + for fn_paragraph in fn_paragraphs: + fn_paragraph.tag = "p" + del fn_paragraph.attrib["rend"] + + ########### + # Figures # + ########### + + """ +

Latin inscription on a wall in Caceres, Spain. CIL II 697
+ + + +

+ An example of the titles + images/Figure1-1_BenedettiSignature.jpg + 60 +

+
+ + """ + + figure_counter = 1 + + eoa_figures = xml_tree.xpath("//t:figure", namespaces=NS_MAP) + + for figure in eoa_figures: + figure.tag = "EOAfigure" + figure.set("id", "anotheruid") + + anchor_element = etree.SubElement(figure, "anchor") + # anchor_element.set("id-text", "id-text") + + # careful, caption can contain markup! + caption_element = figure.xpath("t:head", namespaces=NS_MAP)[0] + caption_element.tag = "caption" + + fig_p_element = etree.SubElement(figure, "p") + figure_file = etree.SubElement(fig_p_element, "file").text = figure.xpath("t:graphic/@url", namespaces=NS_MAP)[0] + figure_width = etree.SubElement(fig_p_element, "width").text = "60" #whatever + fig_p_element.append(caption_element) + + etree.strip_elements(figure, "{%s}graphic" % ns_tei) + + ############## + # Hi-Element # + ############## + eoa_hi = xml_tree.xpath("//t:hi", namespaces=NS_MAP) + + for hi in eoa_hi: + rend_attribute = hi.get("rend") + + if rend_attribute == "italic": + hi.set("rend", "it") + elif rend_attribute == "sup": + hi.tag = "EOAup" + del hi.attrib["rend"] + elif rend_attribute == "sub": + hi.tag = "EOAdown" + del hi.attrib["rend"] + else: + logging.info("The rend attribute in hi has the value %s. This is not supported" % rend_attribute) + + return xml_tree +# def transform_body ends here + +def assign_ids(xml_tree, data): + """Walk the xml tree again. 
Assign ids to xml and put them into dicts, as well.""" + + chapterdict = {} + figdict = {} + eqdict = {} + fndict = {} + listdict = {} + pagelabeldict = {} + secdict = {} + tabdict = {} + theoremdict = {} + + chapter_counter = 1 + xml_chapters = xml_tree.xpath("//div1") + for chapter in xml_chapters: + equation_counter = 1 + footnote_counter = 1 + list_counter = 1 + section_counter = 1 + table_counter = 1 + theorem_counter = 1 + + if chapter.get('rend') != "nonumber": + chapter.set("id-text", str(chapter_counter)) + chapterdict[chapter.get("id")] = str(chapter_counter) + + figure_anchors = chapter.findall(".//EOAfigure/anchor") + figure_counter = 1 + for anchor in figure_anchors: + figure_number = "%d.%d" % (chapter_counter, figure_counter) + + anchor.set("id-text", figure_number) + figure_counter += 1 + + figure_element = anchor.getparent() + figure_element.set("id", anchor.get("id")) + figdict[anchor.get("id")] = figure_number + + footnotes = chapter.findall(".//note") + for footnote in footnotes: + fndict[footnote.get("id")] = footnote.get("n") + + sections = chapter.findall(".//div2") + section_counter = 1 + for section in sections: + section_number = "%d.%d" % (chapter_counter, section_counter) + section.set("id-text", section_number) + secdict[section.get("id")] = section_number + + subsection_counter = 1 + subsections = section.findall(".//div3") + for subsection in subsections: + subsection_number = "%d.%d.%d" % (chapter_counter, section_counter, subsection_counter) + subsection.set("id-text", subsection_number) + secdict[subsection.get("id")] = subsection_number + subsection_counter += 1 + + section_counter += 1 + chapter_counter += 1 + + # not implemented yet: equation, list, pagelabel, tab, theorem + + data["chapterdict"] = chapterdict + data["figdict"] = figdict + data["eqdict"] = eqdict + data["fndict"] = fndict + data["listdict"] = listdict + data["pagelabeldict"] = pagelabeldict + data["secdict"] = secdict + data["tabdict"] = tabdict + 
data["theoremdict"] = theoremdict + + return xml_tree, data +# def assign_ids ends here + +def add_bibliography(xml_tree, refs_for_bib_chapter): + """Add another chapter containing the bibliography.""" + + root_element = xml_tree.getroot() + + xml_chapters = root_element.xpath("//div1") + number_of_chapters = len(xml_chapters) + bibliography_chapter = etree.Element("div1", rend="nonumber", language="english") + # this needs to be configurable by language + bib_head = etree.SubElement(bibliography_chapter, "head").text = "Bibliography" + bib_div_1 = etree.SubElement(bibliography_chapter, "div") + bib_div_2 = etree.SubElement(bib_div_1, "div") + + entries = refs_for_bib_chapter.findall(".//div") + + for entry in entries: + entry_id = entry.get("id") + entry.set("class", "bibliography") + etree.strip_tags(entry, "p") + entry.tag = "p" + internal_markup = entry.findall(".//em") + for markup in internal_markup: + markup.tag = "i" + + bib_div_2.append(entry) + + root_element.insert(number_of_chapters + 1, bibliography_chapter) + + return root_element +# def add_bibliography ends here + +if __name__ == '__main__': + if len(sys.argv) == 1: + print("You must specify an input file!") + sys.exit() + elif len(sys.argv) > 2: + print("You can work with only one publication at a time!") + sys.exit() + + with open(OUTPUT_DIR + os.path.sep + 'data.pickle', 'rb') as f: + data = pickle.load(f) + + used_citekeys = data["citekeys"] + + tei_document = sys.argv[-1] + xml_tree = etree.parse(tei_document) + + bib_data = {} + + publication_language = xml_tree.xpath("//t:teiHeader/t:profileDesc/t:langUsage/t:language/@ident", namespaces=NS_MAP)[0] + bib_data["source"] = xml_tree.xpath("//t:teiHeader/t:fileDesc/t:sourceDesc/t:listBibl/@source", namespaces=NS_MAP)[0] + bib_data["type"] = xml_tree.xpath("//t:teiHeader/t:fileDesc/t:sourceDesc/t:listBibl/@type", namespaces=NS_MAP)[0] + + # json + interim_bib_json_file = "tmp-bib.json" + citeproc_command = "pandoc-citeproc --bib2json %s" % 
bib_data["source"] + citeproc_arguments = shlex.split(citeproc_command) + citeproc_process = subprocess.Popen(citeproc_arguments, stdout=subprocess.PIPE) + citeproc_json = citeproc_process.stdout.read() + citations_json = json.loads(citeproc_json) + + if bib_data["type"] not in ["monograph", "anthology", "monograph-numeric", "anthology-numeric"]: + print("The bibliography type %s is not allowed." % bib_data["type"]) + + command = "pandoc -o %sformatted_citations.html -t html --filter=pandoc-citeproc --bibliography=%s --csl=%s %s" % (OUTPUT_DIR + os.path.sep, bib_data["source"], CSL_FILE, OUTPUT_DIR + os.path.sep + "used_citations.md") + arguments = shlex.split(command) + logging.info("Using external command pandoc.") + subprocess.call(arguments) + + # refs for bib_chapter contains formatted reference entries + cited_dict, refs_for_bib_chapter = format_citations(set(used_citekeys), citations_json) + # render_reference(all_references, cited_dict) + + tei_body = xml_tree.xpath("//t:text", namespaces=NS_MAP)[0] + + body_transformed = transform_body(tei_body, cited_dict, publang=publication_language) + + resulting_tree = etree.ElementTree(body_transformed) + xml_add_bib = add_bibliography(resulting_tree, refs_for_bib_chapter) + + etree.strip_tags(xml_add_bib, "tagtobestripped") + + elements_with_ids = xml_add_bib.xpath("//div1 | //div2 | //div3 | //note | //item | //table | //EOAfigure/anchor | //EOAequation | //formula | //theorem") + element_counter = 1 + for element in elements_with_ids: + element.set("id", "uid" + str(element_counter)) + element_counter += 1 + + assigned_ids, data_to_pickle = assign_ids(resulting_tree, data) + xml_root = assigned_ids.getroot() + + xml_root.tag = "Book" + + final_tree = etree.ElementTree(xml_root) + # objectify.deannotate(final_tree, cleanup_namespaces=True) + # etree.cleanup_namespaces(xml_root) + + with open(OUTPUT_DIR + os.path.sep + 'data.pickle', 'wb') as f: + # Pickle the 'data' dictionary using the highest protocol available. 
+ pickle.dump(data_to_pickle, f, pickle.HIGHEST_PROTOCOL) + + if not os.path.exists("CONVERT"): + os.mkdir(os.path.expanduser("CONVERT")) + if not os.path.exists("debug"): + os.mkdir(os.path.expanduser("debug")) + + if not os.path.exists(OUTPUT_DIR): + os.mkdir(os.path.expanduser(OUTPUT_DIR)) + output_filename = OUTPUT_DIR + os.path.sep + "IntermediateXMLFile.xml" + + # resulting_tree.write(output_filename, pretty_print=True, xml_declaration=True,encoding="utf-8") + final_tree.write(output_filename, pretty_print=True, xml_declaration=True,encoding="utf-8") + logging.info("Wrote %s." % output_filename) + + bad_ns_string = 'xmlns="http://www.tei-c.org/ns/1.0"' + with open(output_filename, 'r') as textfile: + xml_as_string = textfile.read() + + removed_namespace = xml_as_string.replace(bad_ns_string, "") + + with open(output_filename, 'w') as amended_textfile: + amended_textfile.write(removed_namespace) +# finis