mirror of
https://github.com/nexus-stc/hyperboria
synced 2025-02-11 01:46:51 +01:00
- [nexus] Remove outdated protos
- [nexus] Development
- [nexus] Development
- [nexus] Development
- [nexus] Development
- [nexus] Development
- [nexus] Refactor views
- [nexus] Update aiosumma
- [nexus] Add tags
- [nexus] Development
- [nexus] Update repository
- [nexus] Update repository
- [nexus] Update dependencies
- [nexus] Update dependencies
- [nexus] Fixes for MetaAPI
- [nexus] Support for new queries
- [nexus] Adopt new versions of search
- [nexus] Improving Nexus
- [nexus] Various fixes
- [nexus] Add profile
- [nexus] Fixes for ingestion
- [nexus] Refactorings and bugfixes
- [idm] Add profile methods
- [nexus] Fix stalled nexus-meta bugs
- [nexus] Various bugfixes
- [nexus] Restore IDM API functionality

GitOrigin-RevId: a0842345a6dde5b321279ab5510a50c0def0e71a
440 lines
15 KiB
Python
import binascii
import logging
import re
from typing import (
    Any,
    Dict,
    List,
    Tuple,
    Union,
    cast,
)

from PyPDF2._cmap import (
    _default_fonts_space_width,
    compute_space_width,
    parse_bfchar,
    parse_encoding,
    prepare_cm,
    unknown_char_map,
)
from PyPDF2.errors import PdfStreamError
from PyPDF2.generic import (
    ContentStream,
    DictionaryObject,
    NameObject,
)

from .text_collector import TextCollector

elsevier_regexp = re.compile(
    r'Downloaded for .* at .* from .* '
    r'by .* on \w+ \d{1,2}, \d{4}. '
    r'For personal use only. No other uses without permission. '
    r'Copyright ©\d{4}. Elsevier Inc. All rights reserved.'
)
bmj_regexp = re.compile(
    r'^.*: first published as .* on \d{1,2} \w+ \d{4}. Downloaded from .*'
    r' on \w+ \d{1,2}, \d{4} at .* Protected by\s*copyright.'
)
downloaded_regexp = re.compile(rb'^[Dd]ownloaded [Ff]rom:? https?://')
terms_of_use_regexp = re.compile(rb'^[Tt]erms [Oo]f [Uu]se:? https?://')


def _is_downloaded_from_https_watermark(text):
    return bool(downloaded_regexp.search(text)) or bool(terms_of_use_regexp.search(text))


def _is_1021_watermark(text):
    return (
        text.startswith(b'Downloaded via ')
        or text.startswith(
            b'See https://pubs.acs.org/sharingguidelines for options '
            b'on how to legitimately share published articles.'
        )
    )


def _is_1093_watermark(text):
    return bool(re.search(rb'^Downloaded from https://(.*) by [\w\s]+ on \d{1,2} \w+ \d{4}$', text))
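

# Hedged illustration (not part of the original module): the watermark
# predicates operate on raw bytes pulled from page content streams. The
# sample strings below are hypothetical.
def _example_watermark_predicates() -> None:
    assert _is_downloaded_from_https_watermark(b'Downloaded from https://example.org')
    assert _is_1021_watermark(b'Downloaded via SOME UNIVERSITY on May 1, 2020')
    assert not _is_1093_watermark(b'an ordinary line of article text')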


def process_cm_line(
    l: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, Tuple[int, int]],
    map_dict,
    int_entry,
):
    # State machine over one line of a /ToUnicode CMap: toggles the
    # bfrange/bfchar sections and dispatches payload lines to the parsers
    if l in (b"", b" ") or l[0] == 37:  # 37 = ord('%'), a comment line
        return process_rg, process_char, multiline_rg
    if b"beginbfrange" in l:
        process_rg = True
    elif b"endbfrange" in l:
        process_rg = False
    elif b"beginbfchar" in l:
        process_char = True
    elif b"endbfchar" in l:
        process_char = False
    elif process_rg:
        multiline_rg = parse_bfrange(l, map_dict, int_entry, multiline_rg)
    elif process_char:
        parse_bfchar(l, map_dict, int_entry)
    return process_rg, process_char, multiline_rg


def parse_bfrange(
    l: bytes,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
    multiline_rg: Union[None, Tuple[int, int]],
) -> Union[None, Tuple[int, int]]:
    lst = [x for x in l.split(b" ") if x]
    closure_found = False
    nbi = len(lst[0])
    map_dict[-1] = nbi // 2
    fmt = b"%%0%dX" % nbi
    if multiline_rg is not None:
        a = multiline_rg[0]  # a, b were parsed on a previous line
        b = multiline_rg[1]
        for sq in lst:  # a continuation line starts with a value, not a range
            if sq == b"]":
                closure_found = True
                break
            map_dict[
                binascii.unhexlify(fmt % a).decode(
                    "charmap" if map_dict[-1] == 1 else "utf-16-be",
                    "surrogatepass",
                )
            ] = binascii.unhexlify(sq).decode("utf-16-be", "surrogatepass")
            int_entry.append(a)
            a += 1
    else:
        a = int(lst[0], 16)
        b = int(lst[1], 16)
        if lst[2] == b"[":
            for sq in lst[3:]:
                if sq == b"]":
                    closure_found = True
                    break
                map_dict[
                    binascii.unhexlify(fmt % a).decode(
                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
                        "surrogatepass",
                    )
                ] = binascii.unhexlify(sq).decode("utf-16-be", "surrogatepass")
                int_entry.append(a)
                a += 1
        else:  # case without list: a single destination start code
            c = int(lst[2], 16)
            fmt2 = b"%%0%dX" % max(4, len(lst[2]))
            closure_found = True
            while a <= b:
                map_dict[
                    binascii.unhexlify(fmt % a).decode(
                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
                        "surrogatepass",
                    )
                ] = binascii.unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
                int_entry.append(a)
                a += 1
                c += 1
    return None if closure_found else (a, b)
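

# Minimal sketch, not part of the original module: parse_bfrange consumes one
# bfrange line already normalized by prepare_cm (angle brackets stripped), so
# b"0041 0043 0061" maps the codes 0x0041..0x0043 onto "a".."c".
def _example_parse_bfrange() -> None:
    map_dict: Dict[Any, Any] = {}
    int_entry: List[int] = []
    leftover = parse_bfrange(b"0041 0043 0061", map_dict, int_entry, None)
    assert leftover is None  # the range was fully closed on this line
    assert map_dict["A"] == "a" and map_dict["C"] == "c"
    assert int_entry == [0x41, 0x42, 0x43]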


def parse_to_unicode(ft: DictionaryObject, space_code: int):
    # map_dict stores the code-to-unicode translation table;
    # map_dict[-1] holds the number of bytes per character code
    map_dict = {}

    # int_entry collects the cmap keys as ints, used later to correct the encoding
    int_entry = []

    if "/ToUnicode" not in ft:
        return {}, space_code, []
    process_rg: bool = False
    process_char: bool = False
    multiline_rg: Union[None, Tuple[int, int]] = None
    cm = prepare_cm(ft)
    for l in cm.split(b"\n"):
        process_rg, process_char, multiline_rg = process_cm_line(
            l.strip(b" "), process_rg, process_char, multiline_rg, map_dict, int_entry
        )

    for a, value in map_dict.items():
        if value == " ":
            space_code = a
    return map_dict, space_code, int_entry
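

# Sketch (hypothetical fragment, not part of the original module): driving
# process_cm_line over a tiny CMap already split and normalized the way
# parse_to_unicode does it.
def _example_process_cm_lines() -> None:
    map_dict: Dict[Any, Any] = {}
    int_entry: List[int] = []
    state = (False, False, None)  # process_rg, process_char, multiline_rg
    for line in (b"beginbfrange", b"0020 0020 0020", b"endbfrange"):
        state = process_cm_line(line, *state, map_dict, int_entry)
    assert map_dict[" "] == " "  # code 0x0020 maps to the space character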


def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
):  # returns (font_type, space_width / 2, encoding, cmap)
    ft: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
    font_type: str = cast(str, ft["/Subtype"])

    space_code = 32
    encoding, space_code = parse_encoding(ft, space_code)
    map_dict, space_code, int_entry = parse_to_unicode(ft, space_code)

    if encoding == "":
        if -1 not in map_dict or map_dict[-1] == 1:
            encoding = "charmap"
        else:
            encoding = "utf-16-be"
    elif isinstance(encoding, dict):
        for x in int_entry:
            if x <= 255:
                encoding[x] = chr(x)
    try:
        # override space_width with the default for this base font
        space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
    except Exception:
        pass
    # we assume the space_code fits in one byte
    if isinstance(space_code, str):
        try:  # one byte
            sp = space_code.encode("charmap")[0]
        except Exception:
            sp = space_code.encode("utf-16-be")
            sp = sp[0] + 256 * sp[1]
    else:
        sp = space_code
    sp_width = compute_space_width(ft, sp, space_width)

    return (
        font_type,
        float(sp_width / 2),
        encoding,
        # https://github.com/python/mypy/issues/4374
        map_dict,
    )
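

# Hedged usage sketch (not part of the original module; 'sample.pdf' is a
# placeholder): build_char_map yields (font_type, space_width / 2, encoding,
# map_dict) for any font listed in a page's /Resources.
def _example_build_char_map(path: str = 'sample.pdf'):
    from PyPDF2 import PdfReader
    page = PdfReader(path).pages[0]
    char_maps = {}
    resources = cast(DictionaryObject, page['/Resources'])
    if '/Font' in resources:
        for font_name in cast(DictionaryObject, resources['/Font']):
            char_maps[font_name] = build_char_map(font_name, 200.0, page)
    return char_maps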


class BasePdfProcessor:
    def __init__(self, remove_pages=None):
        self.remove_pages = remove_pages or tuple()

    def process_page(self, page, pdf_reader):
        return page

    def process(self, pdf_reader, pdf_writer):
        for page_num, page in enumerate(pdf_reader.pages):
            if page_num in self.remove_pages:
                continue
            try:
                page = self.process_page(page, pdf_reader)
            except (PdfStreamError, binascii.Error) as e:
                logging.getLogger('warning').warning({
                    'action': 'pdf_stream_error',
                    'mode': 'pylon',
                    'error': str(e),
                })
            pdf_writer.add_page(page)


class BaseWatermarkEraser(BasePdfProcessor):
    def __init__(
        self,
        is_watermark_predicate=_is_downloaded_from_https_watermark,
        watermark_orientations=None,
        remove_pages=None,
    ):
        super().__init__(remove_pages=remove_pages)
        self.is_watermark_predicate = is_watermark_predicate
        self.watermark_orientations = (
            watermark_orientations if watermark_orientations is not None else (0, 90, 180, 270)
        )


class WatermarkEraser1(BaseWatermarkEraser):
    def process_page(self, page, pdf_reader):
        if '/XObject' in page['/Resources']:
            xobj = page['/Resources']['/XObject']
            content = ContentStream(page['/Contents'], pdf_reader, "bytes")

            xobj_death_note = []
            operations_death_note = []
            for op_i, (operands, operation) in enumerate(content.operations):
                if operation == b"Do":
                    nested_op = xobj[operands[0]]
                    if nested_op["/Subtype"] != "/Image":
                        text = page.extract_xform_text(nested_op, self.watermark_orientations, 200.0)  # type: ignore
                        if self.is_watermark_predicate(text.encode()):
                            xobj_death_note.append(operands[0])
                            operations_death_note.append(op_i)
                            logging.getLogger('debug').debug({
                                'action': 'watermark_removal',
                                'mode': 'pylon',
                                'text': text,
                            })

            # Erase XObjects with watermarks
            for xobj_name in sorted(xobj_death_note, reverse=True):
                del xobj[xobj_name]

            # Erase operations with watermarks
            for op_i in reversed(operations_death_note):
                del content.operations[op_i]

            if operations_death_note or xobj_death_note:
                page[NameObject('/Contents')] = content
                page.compress_content_streams()

        return page


class WatermarkEraser2(BaseWatermarkEraser):
    def process_page(self, page, pdf_reader):
        content = ContentStream(page['/Contents'], pdf_reader, "bytes")
        operations_death_note = []

        for op_i, (operands, operation) in enumerate(content.operations):
            if operation == b"Tj":
                if isinstance(operands[0], bytes) and self.is_watermark_predicate(operands[0]):
                    operations_death_note.append(op_i)
                    logging.getLogger('debug').debug({
                        'action': 'watermark_removal',
                        'mode': 'pylon',
                        'text': operands[0].decode(),
                    })

        # Erase operations with watermarks
        for op_i in reversed(operations_death_note):
            del content.operations[op_i]

        if operations_death_note:
            page[NameObject('/Contents')] = content
            page.compress_content_streams()

        return page


class WatermarkEraser3(BaseWatermarkEraser):
    def process_page(self, page, pdf_reader):
        content = ContentStream(page['/Contents'], pdf_reader, "bytes")
        operations_death_note = []

        for op_i, (operands, operation) in enumerate(content.operations):
            if operation == b"TJ":
                text = b''
                for operand in operands[0]:
                    if isinstance(operand, bytes):
                        text += operand
                if self.is_watermark_predicate(text):
                    operations_death_note.append(op_i)
                    logging.getLogger('debug').debug({
                        'action': 'watermark_removal',
                        'mode': 'pylon',
                        'text': text.decode(),
                    })

        # Erase operations with watermarks
        for op_i in reversed(operations_death_note):
            del content.operations[op_i]

        if operations_death_note:
            page[NameObject('/Contents')] = content
            page.compress_content_streams()

        return page


class WatermarkEraser4(BaseWatermarkEraser):
    def __init__(self, regexp, inverted=False):
        super().__init__()
        self.regexp = regexp
        self.inverted = inverted

    def process_page(self, page, pdf_reader):
        content = ContentStream(page['/Contents'], pdf_reader, "bytes")
        operations_death_note = []

        cmaps = {}
        space_width = 200.0
        resources_dict = cast(DictionaryObject, page['/Resources'])
        tc = TextCollector(self.inverted)

        if "/Font" in resources_dict:
            for f in cast(DictionaryObject, resources_dict["/Font"]):
                cmaps[f] = build_char_map(f, space_width, page)

        cm_stack = []
        cmap = ("charmap", {}, "NotInitialized")

        for op_i, (operands, operation) in enumerate(content.operations):
            if operation == b"q":
                cm_stack.append(cmap)
            elif operation == b"Q":
                try:
                    cmap = cm_stack.pop()
                except Exception:
                    pass
            elif operation == b"Tf":
                try:
                    _space_width = cmaps[operands[0]][1]
                    cmap = (
                        cmaps[operands[0]][2],
                        cmaps[operands[0]][3],
                        operands[0],
                    )
                except KeyError:  # font not found
                    _space_width = unknown_char_map[1]
                    cmap = (
                        unknown_char_map[2],
                        unknown_char_map[3],
                        "???" + operands[0],
                    )
            elif operation == b"Tj":
                if isinstance(operands[0], str):
                    text = operands[0]
                else:
                    if isinstance(cmap[0], str):
                        try:
                            t = operands[0].decode(cmap[0], "surrogatepass")
                        except Exception:
                            t = operands[0].decode("utf-16-be" if cmap[0] == "charmap" else "charmap", "surrogatepass")
                    else:
                        t = "".join(
                            [
                                cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
                                for x in operands[0]
                            ]
                        )
                    text = "".join([cmap[1][x] if x in cmap[1] else x for x in t])
                tc.add_piece(text, op_i)
                text, matched = tc.match(self.regexp)
                if matched:
                    operations_death_note.extend(matched)
                    logging.getLogger('debug').debug({
                        'action': 'watermark_removal',
                        'mode': 'pylon',
                        'matched': text,
                    })
                    tc.clear()

        # Erase operations with watermarks
        for op_i in reversed(operations_death_note):
            del content.operations[op_i]

        if operations_death_note:
            page[NameObject('/Contents')] = content
            page.compress_content_streams()

        return page


pdf_processors = {
    '10.1001': WatermarkEraser1(watermark_orientations=(0,)),
    '10.1016': WatermarkEraser4(elsevier_regexp),
    '10.1021': WatermarkEraser1(is_watermark_predicate=_is_1021_watermark, watermark_orientations=(90,)),
    '10.1073': WatermarkEraser1(watermark_orientations=(90,)),
    '10.1088': WatermarkEraser1(is_watermark_predicate=lambda text: False, remove_pages=(0,)),
    '10.1093': WatermarkEraser2(is_watermark_predicate=_is_1093_watermark),
    '10.1126': WatermarkEraser1(watermark_orientations=(270,)),
    '10.1136': WatermarkEraser4(bmj_regexp, inverted=True),
    '10.1287': WatermarkEraser1(
        watermark_orientations=(90,),
        remove_pages=(0,),
    ),
    '10.2108': WatermarkEraser3(),
}

base_pdf_processor = BasePdfProcessor()
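

# End-to-end sketch (assumes PyPDF2's PdfReader/PdfWriter; the DOI and file
# names below are placeholders): pick a processor by DOI prefix, fall back to
# the no-op base processor, and write the cleaned document out.
def _example_clean_pdf(doi: str = '10.1016/j.example.2020.01.001', src: str = 'in.pdf', dst: str = 'out.pdf') -> None:
    from PyPDF2 import PdfReader, PdfWriter
    prefix = doi.split('/')[0]
    processor = pdf_processors.get(prefix, base_pdf_processor)
    pdf_writer = PdfWriter()
    processor.process(PdfReader(src), pdf_writer)
    with open(dst, 'wb') as file:
        pdf_writer.write(file)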