From 6b69d092fa0deb4c1d9a43b06dce363c0d38047a Mon Sep 17 00:00:00 2001 From: the-superpirate Date: Sun, 2 May 2021 10:45:22 +0300 Subject: [PATCH] - fix(nexus): Fix DOI detection in messages GitOrigin-RevId: e4d4519221319c26134d8cbbd011eca2def2b6b3 --- nexus/meta_api/README.md | 2 +- nexus/meta_api/query_extensionner/checks.py | 9 +++++---- nexus/nlptools/regex.py | 1 + 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/nexus/meta_api/README.md b/nexus/meta_api/README.md index e47d77d..a42054d 100644 --- a/nexus/meta_api/README.md +++ b/nexus/meta_api/README.md @@ -1,5 +1,5 @@ # Nexus Search: Meta API ``` -NEXUS_META_API_summa.url=http://summa bazel run -c opt binary +NEXUS_META_API_summa.url=http://summa bazel run binary ``` \ No newline at end of file diff --git a/nexus/meta_api/query_extensionner/checks.py b/nexus/meta_api/query_extensionner/checks.py index 6c01f1c..06bd294 100644 --- a/nexus/meta_api/query_extensionner/checks.py +++ b/nexus/meta_api/query_extensionner/checks.py @@ -5,6 +5,7 @@ from nexus.nlptools.regex import ( DOI_REGEX, ISBN_REGEX, NID_REGEX, + ONLY_DOI_REGEX, URL_REGEX, ) @@ -20,10 +21,10 @@ class QueryClass(Enum): def check_doi(query) -> (QueryClass, str): - # ToDo: rewrite normally, just hotfixed - if query.startswith('references:'): - return - if r := re.search(DOI_REGEX, query): + if ( + ((r := re.search(DOI_REGEX, query)) and re.search(URL_REGEX, query)) + or re.search(ONLY_DOI_REGEX, query) + ): doi = (r[1] + '/' + r[2]).lower() return { 'doi': doi, diff --git a/nexus/nlptools/regex.py b/nexus/nlptools/regex.py index f5cf3c9..732b82b 100644 --- a/nexus/nlptools/regex.py +++ b/nexus/nlptools/regex.py @@ -29,4 +29,5 @@ DOI_REGEX = re.compile(r'(10.\d{4,9})\s?/\s?([-._;()<>/:A-Za-z0-9]+[^.?\s])') ISBN_REGEX = re.compile(r'^(?:[iI][sS][bB][nN]\:?\s*)?((97(8|9))?\-?\d{9}(\d|X))$') MD5_REGEX = re.compile(r'([A-Fa-f0-9]{32})') NID_REGEX = re.compile(r'(?:[Nn][Ii][Dd]\s?:?\s*)([0-9]+)') +ONLY_DOI_REGEX = re.compile(r'^(10.\d{4,9})\s?/\s?([-._;()<>/:A-Za-z0-9]+[^.?\s])$') PUBMED_ID_REGEX = re.compile(r'(?:(?:https?://)?(?:www.)?ncbi.nlm.nih.gov/pubmed/|[Pp][Mm][Ii][Dd]\s?:?\s*)([0-9]+)')