autopubs.py

#! /usr/bin/env python3

import os
from pprint import pprint
import re
import sys

import requests


# Leading comma-free segment of an affiliation string that later mentions the institute.
_INSTITUTE_UNIT_RE = re.compile(
    r"([^,]+),.*Max Planck Institute for Molecular Genetics", flags=re.I
)
# "Unit/Sub-unit ..." -> keeps only the part in front of the first slash.
_BEFORE_SLASH_RE = re.compile(r'^([^/]+)/.*')


def parse_organization(name):
    """Return the institute unit named at the start of an affiliation string.

    Example input:
        "Herwig Lab/Bioinformatics, Dept. of Computational Molecular Biology
        (Head: Martin Vingron), Max Planck Institute for Molecular Genetics,
        Max Planck Society"
    for which "Herwig Lab" is returned.

    The text in front of the first comma is taken as the unit, provided that
    "Max Planck Institute for Molecular Genetics" (case-insensitive) occurs
    somewhere after that comma.  If the unit contains a slash, only the part
    before the first slash is kept.  Surrounding whitespace is removed.

    Returns None when *name* does not have that shape.
    """
    found = _INSTITUTE_UNIT_RE.match(name)
    if found is None:
        return None
    leading_segment = found.group(1).strip()
    return _BEFORE_SLASH_RE.sub(r'\1', leading_segment).strip()


# Organizational-unit name (in the form produced by parse_organization()) -> web
# page of that unit.  URLs are kept exactly as entered; note that a few of them
# contain an "/en/" path segment.
#
# add_ou_href() adds `name: None` entries at runtime for names it could not
# resolve, so a value of None means "already warned about, no link available".
OU_URL = {
    # Units named "<Name> Lab"
    "Aktas Lab": "https://www.molgen.mpg.de/aktas-lab",
    "Arndt Lab": "https://www.molgen.mpg.de/arndt-lab",
    "Bulut-Karslioglu Lab": "https://www.molgen.mpg.de/bulut-karslioglu-lab",
    "Herrmann Lab": "https://www.molgen.mpg.de/herrmann-lab",
    "Herwig Lab": "https://www.molgen.mpg.de/herwig-lab",
    "Ibrahim Lab": "https://www.molgen.mpg.de/ibrahim-lab",
    "Kalscheuer Lab": "https://www.molgen.mpg.de/kalscheuer-lab",
    "Kinkley Lab": "https://www.molgen.mpg.de/kinkley-lab",
    "Kraushar Lab": "https://www.molgen.mpg.de/kraushar-lab",
    "Kretzmer Lab": "https://www.molgen.mpg.de/kretzmer-lab",
    "Mayer Lab": "https://www.molgen.mpg.de/en/mayer-lab",
    "Meissner Lab": "https://www.molgen.mpg.de/meissner-lab",
    "Metzger Lab": "https://www.molgen.mpg.de/metzger-lab",
    "Müller Lab": "https://www.molgen.mpg.de/mueller-lab",
    "Mundlos Lab": "https://www.molgen.mpg.de/mundlos-lab",
    "Ralser Lab": "https://www.molgen.mpg.de/ralser-lab",
    "Reinert Lab": "https://www.molgen.mpg.de/reinert-lab",
    "Smith Lab": "https://www.molgen.mpg.de/en/smith-lab",
    "Vallier Lab": "https://www.molgen.mpg.de/vallier-lab",
    "Vingron Lab": "https://www.molgen.mpg.de/en/vingron-lab",
    "Yaspo Lab": "https://www.molgen.mpg.de/yaspo-lab",

    # Unit whose name does not follow the "<Name> Lab" pattern
    "Multi-Level Gene Control (Denes Hnisz)": "https://www.molgen.mpg.de/hnisz-lab",

    # Units named after a technology (presumably service facilities -- confirm)
    "Mass Spectrometry": "https://www.molgen.mpg.de/mass-spectrometry",
    "Microscopy and Cryo Electron Microscopy": "https://www.molgen.mpg.de/microscopy",
    "Sequencing": "https://www.molgen.mpg.de/sequencing",
}


def add_ou_href(name):
    """Return *name* as an HTML link to its unit page when a URL is known.

    *name* is looked up in OU_URL.  If a URL is registered, an ``<a>``
    element pointing at it is returned; otherwise *name* is returned
    unchanged.

    Side effects: the first time an unknown name is seen, a warning is
    printed and the name is stored in OU_URL with the value None, so each
    unknown name is reported only once per run.
    """
    url = OU_URL.get(name)
    if url is not None:
        # NOTE(review): the href value is emitted unquoted, unlike the ORCID
        # links built in get_entries().  Left as is so that already stored
        # `groups` values do not change; confirm whether quoting is wanted.
        return f'<a href={url}>{name}</a>'
    if name not in OU_URL:
        print(f"WARNING: no link for ou '{name}'")
        # Remember the miss so repeated lookups of this name stay silent.
        OU_URL[name] = None
    return name


def _build_search_query(since):
    """Return the Elasticsearch query for released institute items dated on/after *since*."""
    # "||/d" is Elasticsearch date math: round the bound to the start of that day.
    lower_bound = f"{since}||/d"
    date_fields = (
        "datePublishedInPrint",
        "datePublishedOnline",
        "dateAccepted",
        "dateSubmitted",
        "dateModified",
        "dateCreated",
    )
    return {
        "bool": {"must": [
            {"term": {"publicState": {"value": "RELEASED"}}},
            {"term": {"versionState": {"value": "RELEASED"}}},
            {"bool": {"must": [
                # The organization may hang off a person creator or be a creator itself.
                {"bool": {"should": [
                    {"term": {"metadata.creators.person.organizations.identifierPath": {"value": "ou_1433545"}}},
                    {"term": {"metadata.creators.organization.identifierPath": {"value": "ou_1433545"}}},
                ]}},
                {"term": {"context.objectId": {"value": "ctx_1479061"}}},
                # One "should" clause per date field, all with the same lower bound.
                {"bool": {"should": [
                    {"range": {f"metadata.{field}": {"gte": lower_bound}}}
                    for field in date_fields
                ]}},
            ]}},
        ]},
    }


def _entry_from_record(record):
    """Convert one search record into an entry dict.

    Returns None when the record has neither an online nor a print
    publication date, no DOI, or no source.
    """
    metadata = record['data']['metadata']
    title = metadata['title']

    # Use the earlier of the two publication dates (compared as strings).
    published = [
        date
        for date in (metadata.get('datePublishedInPrint'), metadata.get('datePublishedOnline'))
        if date is not None
    ]
    if not published:
        return None
    date_published = min(published)

    # When several DOIs are listed the last one wins.
    # NOTE(review): confirm that the last (rather than the first) DOI is intended.
    doi = None
    for identifier in metadata.get('identifiers', []):
        if identifier['type'] == "DOI":                      # 'DOI'
            doi = identifier['id'].strip()                   # '10.1093/nargab/lqae135 '
    if doi is None:
        return None

    # When several sources are listed the last one wins.
    # NOTE(review): confirm that the last (rather than the first) source is intended.
    source_text = None
    for source in metadata.get('sources', []):
        source_text = source['title']                        # 'NAR: genomics and bioinformatics'
        volume = source.get('volume')                        # '6'
        if volume is not None:
            source_text += f" volume {volume}"
        issue = source.get('issue')                          # '4'
        if issue is not None:
            source_text += f" issue {issue}"
    if source_text is None:
        return None

    authors = []
    ou_names = set()
    for creator in metadata.get('creators', []):
        # Filter before touching creator['person']: the query also matches
        # creators of type ORGANIZATION, which may carry no 'person' object.
        if creator.get('role') != "AUTHOR" or creator.get('type') != "PERSON":
            continue
        person = creator['person']

        for organization in person.get('organizations', []):
            ou_name = parse_organization(organization['name'])
            if ou_name is not None:
                ou_names.add(ou_name)

        # "Family, Given"; tolerate a missing half instead of raising KeyError.
        name_parts = [
            part for part in (person.get('familyName'), person.get('givenName')) if part
        ]
        name = ", ".join(name_parts).strip()                 # 'Altay, Aybuge'
        if not name:
            continue

        orcid = person.get('orcid')                          # 'https://orcid.org/0000-0001-7341-7091'
        if orcid is None:
            authors.append(name)
        else:
            # Authors with an ORCID are rendered bold and linked to it.
            authors.append(f'<b><a href="{orcid}">{name}</a></b>')

    return {
        'datePublished': date_published,
        'authors': '; '.join(authors),
        'title': title,
        'doi': doi,
        'source': source_text,
        'ou_names': ou_names,
    }


def get_entries(since='2025-03-01', timeout=60):
    """Fetch publication entries from the MPG.PuRe REST API.

    Parameters:
        since: date string (YYYY-MM-DD) used as the lower bound for the
            item's date fields in the search query.
        timeout: timeout in seconds passed to ``requests.post`` so that the
            script cannot wait indefinitely on an unresponsive server.

    Returns:
        A list of dicts with the keys 'datePublished', 'authors' (HTML
        string, authors separated by '; '), 'title', 'doi', 'source' and
        'ou_names' (set of unit names found by parse_organization()).
        Records without a publication date, DOI or source are left out.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within *timeout*.
        KeyError: if a record lacks a field this function relies on.
    """

    # https://pure.mpg.de/rest/swagger-ui.html
    # https://colab.mpdl.mpg.de/mediawiki/PubMan_REST_API_Documentation

    BASE = 'https://pure.mpg.de/rest'

    # How the query was obtained:
    # create search with https://pure.mpg.de/pubman/faces/AdvancedSearchPage.jsp
    # For example:
    #    Organization: "Max Planck Institute for Molecular Genetics, Max Planck Society"
    #  AND
    #    Kontext: "Publications of the MPI for Molecular Genetics"
    #  AND
    #    Datum  Von: <start date>
    #
    # now: "Abfrage in REST-Schnittstelle übernehmen"
    # now: sort by lastModificationDate descending, choose max records, export format json
    # download curl command

    payload = {
        'query': _build_search_query(since),
        "sort": [{"lastModificationDate": {"order": "desc"}}],
        # Only the first 1000 hits are requested; there is no paging.
        "size": "1000",
        "from": "0"
    }

    headers = {
        'Cache-Control': 'no-cache',
        'Content-Type': 'application/json',
    }

    response = requests.post(
        BASE + '/items/search?format=json',
        json=payload,
        headers=headers,
        timeout=timeout,
    )
    response.raise_for_status()

    # Treat a missing or null 'records' key as "no hits".
    records = response.json().get('records') or []

    entries = []
    for record in records:
        entry = _entry_from_record(record)
        if entry is not None:
            entries.append(entry)
    return entries


def update_django(entries):
    """Create or update one Publication row per entry, keyed by DOI.

    *entries* is an iterable of dicts as returned by get_entries().  They are
    processed in ascending 'datePublished' order.  For every entry a
    Publication with that DOI is fetched or created; an existing row is saved
    again only if one of title, authors, source or groups differs.  A line is
    printed for each row that was created or updated.

    Django is configured here (settings module default, project path, setup)
    before the model is imported, which is why the imports are local.
    """
    import django

    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings.local')
    sys.path.append('/project/intranet/mpicms')
    django.setup()

    # Must come after django.setup().
    from mpicms.publications.models import Publication

    # To start from an empty table, run this once by hand:
    #     Publication.objects.all().delete()

    chronological = sorted(entries, key=lambda item: item['datePublished'])
    for entry in chronological:
        title_html = "".join(("<b>", entry['title'], "</b>"))
        group_links = [add_ou_href(unit) for unit in sorted(entry['ou_names'])]
        fields = {
            'title': title_html,
            'authors': entry['authors'],
            'source': entry['source'],
            'groups': " ".join(group_links),
        }

        publication, was_created = Publication.objects.get_or_create(
            doi=entry['doi'], defaults=fields
        )
        if was_created:
            print(f"created: {entry['title']}")
            continue

        # Existing row: write back only the fields whose stored value differs.
        stale = [key for key, value in fields.items() if getattr(publication, key) != value]
        for key in stale:
            setattr(publication, key, fields[key])
        if stale:
            publication.save()
            print(f"updated: {entry['title']}")


if __name__ == '__main__':
    # Only contact the REST API and write to the database when run as a
    # script; importing this module must not trigger either.
    entries = get_entries()
    update_django(entries)