mirror of
https://github.com/nexus-stc/hyperboria
synced 2025-02-14 02:56:48 +01:00
- [nexus] Remove outdated protos - [nexus] Development - [nexus] Development - [nexus] Development - [nexus] Development - [nexus] Development - [nexus] Refactor views - [nexus] Update aiosumma - [nexus] Add tags - [nexus] Development - [nexus] Update repository - [nexus] Update repository - [nexus] Update dependencies - [nexus] Update dependencies - [nexus] Fixes for MetaAPI - [nexus] Support for new queries - [nexus] Adopt new versions of search - [nexus] Improving Nexus - [nexus] Various fixes - [nexus] Add profile - [nexus] Fixes for ingestion - [nexus] Refactorings and bugfixes - [idm] Add profile methods - [nexus] Fix stalled nexus-meta bugs - [nexus] Various bugfixes - [nexus] Restore IDM API functionality GitOrigin-RevId: a0842345a6dde5b321279ab5510a50c0def0e71a
35 lines
807 B
Python
35 lines
807 B
Python
import io
|
|
|
|
from PyPDF2 import (
|
|
PdfReader,
|
|
PdfWriter,
|
|
)
|
|
from PyPDF2.generic import DictionaryObject
|
|
|
|
from .watermarks import (
|
|
base_pdf_processor,
|
|
pdf_processors,
|
|
)
|
|
|
|
|
|
def is_pdf(f):
    """Return True when *f* begins with the ``%PDF`` magic marker.

    The check requires *f* to be strictly longer than four items, so a
    payload consisting of nothing but the four-byte marker is rejected.

    Args:
        f: A sliceable buffer supporting ``len()``; compared against the
            bytes literal ``b'%PDF'`` (presumably raw file content as
            ``bytes`` -- confirm against callers).

    Returns:
        bool: True if the buffer is longer than four items and its first
        four items equal ``b'%PDF'``, otherwise False.
    """
    return len(f) > 4 and f[:4] == b'%PDF'
|
|
|
|
|
|
def clean_metadata(pdf, doi=None):
    """Re-serialize a PDF through a (possibly DOI-specific) processor.

    The input is parsed with a non-strict ``PdfReader``, a fresh
    ``PdfWriter`` is prepared with an empty info dictionary, and the
    selected processor's ``process(reader, writer)`` is invoked before the
    writer's content is rendered to bytes.

    Args:
        pdf: Raw content of the source document; wrapped in ``io.BytesIO``
            before being handed to ``PdfReader``.
        doi: Optional DOI string. The text before its first ``/`` is used
            as the key into ``pdf_processors``; when *doi* is falsy or the
            key is absent, ``base_pdf_processor`` is used.

    Returns:
        bytes: The document produced by the writer after processing.
    """
    reader = PdfReader(io.BytesIO(pdf), strict=False)
    writer = PdfWriter()

    # Discard the writer's most recently registered object and register an
    # empty dictionary as the info object in its place.
    # NOTE(review): presumably the discarded object is the default info
    # dictionary that PdfWriter creates in its constructor; this relies on
    # PyPDF2 private attributes, so confirm against the pinned version.
    writer._objects = writer._objects[:-1]
    writer._info = writer._add_object(DictionaryObject())

    # Fall back to the base processor unless the DOI prefix has a
    # dedicated processor registered.
    pdf_processor = base_pdf_processor
    if doi:
        doi_prefix = doi.split('/')[0]
        if doi_prefix in pdf_processors:
            pdf_processor = pdf_processors[doi_prefix]

    pdf_processor.process(reader, writer)

    buffer = io.BytesIO()
    writer.write_stream(buffer)
    # getvalue() returns the whole buffer regardless of the stream position,
    # so no flush/seek/read sequence is needed.
    return buffer.getvalue()
|