- fix(nexus): Fix DOI detection in messages

GitOrigin-RevId: e4d4519221319c26134d8cbbd011eca2def2b6b3
This commit is contained in:
the-superpirate 2021-05-02 10:45:22 +03:00
parent 00c6c6ffff
commit 6b69d092fa
3 changed files with 7 additions and 5 deletions

View File

@ -1,5 +1,5 @@
# Nexus Search: Meta API
```
NEXUS_META_API_summa.url=http://summa bazel run -c opt binary
NEXUS_META_API_summa.url=http://summa bazel run binary
```

View File

@ -5,6 +5,7 @@ from nexus.nlptools.regex import (
DOI_REGEX,
ISBN_REGEX,
NID_REGEX,
ONLY_DOI_REGEX,
URL_REGEX,
)
@ -20,10 +21,10 @@ class QueryClass(Enum):
def check_doi(query) -> (QueryClass, str):
# ToDo: rewrite normally, just hotfixed
if query.startswith('references:'):
return
if r := re.search(DOI_REGEX, query):
if (
((r := re.search(DOI_REGEX, query)) and re.search(URL_REGEX, query))
or re.search(ONLY_DOI_REGEX, query)
):
doi = (r[1] + '/' + r[2]).lower()
return {
'doi': doi,

View File

@ -29,4 +29,5 @@ DOI_REGEX = re.compile(r'(10.\d{4,9})\s?/\s?([-._;()<>/:A-Za-z0-9]+[^.?\s])')
# Whole-string ISBN: optional case-insensitive "ISBN" prefix (with optional
# colon and whitespace), optional 978/979 prefix, optional hyphen, nine digits
# and a final digit or 'X'. Group 1 captures the number itself.
ISBN_REGEX = re.compile(r'^(?:[iI][sS][bB][nN]\:?\s*)?((97(8|9))?\-?\d{9}(\d|X))$')
# Any run of 32 hexadecimal characters anywhere in the string (group 1).
MD5_REGEX = re.compile(r'([A-Fa-f0-9]{32})')
# "NID" in any letter case, optional whitespace/colon, then the digits (group 1).
NID_REGEX = re.compile(r'(?:[Nn][Ii][Dd]\s?:?\s*)([0-9]+)')
# Anchored DOI pattern: the *entire* string must be a DOI.
# Group 1 is the prefix ("10." followed by 4-9 digits), group 2 is the suffix.
# The dot after "10" is escaped so that only a literal "10." is accepted;
# an unescaped dot would also accept strings such as "1001000/xyz".
ONLY_DOI_REGEX = re.compile(r'^(10\.\d{4,9})\s?/\s?([-._;()<>/:A-Za-z0-9]+[^.?\s])$')
# PubMed identifier given either as an ncbi.nlm.nih.gov/pubmed/ URL (scheme and
# "www." optional) or as "PMID" in any letter case followed by the digits.
# Group 1 captures the numeric id. Dots in the host name are escaped so they
# match literally instead of matching any character.
PUBMED_ID_REGEX = re.compile(
    r'(?:(?:https?://)?(?:www\.)?ncbi\.nlm\.nih\.gov/pubmed/|[Pp][Mm][Ii][Dd]\s?:?\s*)([0-9]+)'
)