diff --git a/pid_resolver_lib/doi_ra_handler.py b/pid_resolver_lib/doi_ra_handler.py index 6ac31c4..c91bb8d 100644 --- a/pid_resolver_lib/doi_ra_handler.py +++ b/pid_resolver_lib/doi_ra_handler.py @@ -24,7 +24,7 @@ RAs: Dict[str, Dict[str, Union[str, int]]] = { 'DataCite': {'mime': 'application/ld+json', 'sleep': 120}, 'Crossref': {'mime': 'application/rdf+xml', 'sleep': 0}, - 'mEDRA': {'mime': 'application/rdf+xml', 'sleep': 0} + 'mEDRA': {'mime': 'application/vnd.medra.onixdoi+xml', 'sleep': 0} } REGISTRATION_AGENCY = 'RA:' diff --git a/pid_resolver_lib/pid_analyzer.py b/pid_resolver_lib/pid_analyzer.py index f2805a4..31c53e5 100644 --- a/pid_resolver_lib/pid_analyzer.py +++ b/pid_resolver_lib/pid_analyzer.py @@ -282,8 +282,9 @@ def analyze_doi_record_crossref(cache_dir: Path, doi: str, orcid_info: Dict[str, def analyze_author_info_medra(creator: etree.Element, namespace_map: Any, orcid_info: List[OrcidProfile]) -> Optional[AuthorInfo]: - given_name_ele: Optional[etree.Element] = creator.find('.//foaf:givenName', namespaces=namespace_map) - family_name_ele: Optional[etree.Element] = creator.find('.//foaf:familyName', namespaces=namespace_map) + given_name_ele: Optional[etree.Element] = creator.find('.//NamesBeforeKey', namespaces=namespace_map) + family_name_ele: Optional[etree.Element] = creator.find('.//KeyNames', namespaces=namespace_map) + orcid_ele: Optional[etree.Element] = creator.find('.//NameIdentifier/IDValue', namespaces=namespace_map) orcid: Optional[str] origin_orcid: Optional[str] @@ -292,9 +293,12 @@ def analyze_author_info_medra(creator: etree.Element, namespace_map: Any, orcid_ given_name = given_name_ele.text.strip() family_name = family_name_ele.text.strip() - orcid, origin_orcid = _match_name_with_orcid_profile(orcid_info, given_name, family_name) - - return AuthorInfo(given_name=given_name, family_name=family_name, orcid=orcid, origin_orcid=origin_orcid, ror=None) + if orcid_ele is not None: + orcid = _get_orcid_id_from_url(orcid_ele.text) + return AuthorInfo(given_name=given_name, family_name=family_name, orcid=orcid, origin_orcid='doi', ror=None) + else: + orcid, origin_orcid = _match_name_with_orcid_profile(orcid_info, given_name, family_name) + return AuthorInfo(given_name=given_name, family_name=family_name, orcid=orcid, origin_orcid=origin_orcid, ror=None) # return None if insufficient information is provided. return None @@ -304,12 +308,13 @@ def analyze_doi_record_medra(cache_dir: Path, doi: str, orcid_info: Dict[str, Li try: rec_str = read_from_cache(doi, cache_dir) - root = etree.fromstring(rec_str) + # encode to bytes because of Unicode strings with encoding declaration + root = etree.fromstring(rec_str.encode()) - title_ele: Optional[etree.Element] = root.find('.//bibo:Article/dc:title', namespaces=root.nsmap) + title_ele: List[etree.Element] = root.xpath('.//onix:Title[parent::onix:ContentItem|parent::onix:DOIMonographicProduct and onix:TitleType[contains(text(), "01")]][1]/onix:TitleText', namespaces={'onix': 'http://www.editeur.org/onix/DOIMetadata/2.0'}) - if title_ele is not None: - title = title_ele.text.strip() + if len(title_ele) == 1: + title = title_ele[0].text.strip() else: title = None @@ -318,7 +323,8 @@ def analyze_doi_record_medra(cache_dir: Path, doi: str, orcid_info: Dict[str, Li else: orcid_author_info = [] - creators: List[etree.Element] = root.findall('.//dc:creator/foaf:Person', namespaces=root.nsmap) + # set prefix for namespace used in whole file + creators: List[etree.Element] = root.xpath('.//onix:Contributor[onix:ContributorRole[contains(text(), "A01")]]', namespaces={'onix': 'http://www.editeur.org/onix/DOIMetadata/2.0'}) authors: List[Optional[AuthorInfo]] = list( map(lambda creator: analyze_author_info_medra(creator, root.nsmap, orcid_author_info), creators)) diff --git a/tests/test_pid_analyzer.py b/tests/test_pid_analyzer.py index 526dbee..7e2fc01 100644 --- a/tests/test_pid_analyzer.py +++ b/tests/test_pid_analyzer.py @@ -108,12 +108,13 @@ def test_analyze_doi_record_medra(self): assert res is not None assert res.doi == '10.26342/2020-64-4' - assert res.title == 'Predicting the humorousness of tweets using gaussian process\n preference learning' + assert res.title == 'Predicting the humorousness of tweets using gaussian process preference learning' assert len(res.authors) == 4 - assert res.authors[0].given_name == 'Edwin' - assert res.authors[0].family_name == 'Simpson' + assert res.authors[0].given_name == 'Tristan' + assert res.authors[0].family_name == 'Miller' + assert res.authors[0].orcid == '0000-0001-6157-8808' def test_get_orcids_from_resolved_dois(self): diff --git a/tests/testdata/medra_test.xml b/tests/testdata/medra_test.xml index 19d55f5..bba0332 100644 --- a/tests/testdata/medra_test.xml +++ b/tests/testdata/medra_test.xml @@ -1,81 +1,98 @@ - - - 44 - - 10.26342/2020-64-4 - - Predicting the humorousness of tweets using gaussian process - preference learning - - - - Simpson - - Edwin - - Edwin Simpson - - - - 37 - - - Do Dinh - Erik-Lân - - Erik-Lân Do Dinh - - - - - 44 - - - 1989-7553 - - urn:issn:1989-7553 - - 1989-7553 - - 1989-7553 - - Procesamiento del Lenguaje Natural - - - - - 2020 - 10.26342/2020-64-4 - - info:doi/10.26342/2020-64-4 - - 37 - - - Gurevych - - Iryna - - Iryna Gurevych - - - - doi:10.26342/2020-64-4 - 10.26342/2020-64-4 - - - Miller - - Tristan - - Tristan Miller - - - - Sociedad Española para el Procesamiento del Lenguaje Natural - - \ No newline at end of file + + +
+ mEDRA + medra@medra.org + PublicService + 20240620 +
+ + 06 + 10.26342/2020-64-4 + http://journal.sepln.org/sepln/ojs/ojs/index.php/pln/article/view/6193 + SEPLN + mEDRA + + + + <TitleType>01</TitleType> + <TitleText>Procesamiento del Lenguaje Natural</TitleText> + + + 01 + Sociedad Española para el Procesamiento del Lenguaje Natural + + ES + + + + 07 + 1989-7553 + + JD + + + + + 05 + 2020 + + + + + + 37 + 44 + + + + <TitleType>01</TitleType> + <TitleText>Predicting the humorousness of tweets using gaussian process preference learning</TitleText> + + + 1 + A01 + + 21 + http://orcid.org/0000-0001-6157-8808 + + Tristan Miller + Miller, Tristan + Tristan + Miller + + + 2 + A01 + Erik-Lân Do Dinh + Do Dinh, Erik-Lân + Erik-Lân + Do Dinh + + + 3 + A01 + Edwin Simpson + Simpson, Edwin + Edwin + Simpson + + + 4 + A01 + Iryna Gurevych + Gurevych, Iryna + Iryna + Gurevych + + + 01 + eng + + 2020 + + +
\ No newline at end of file