Add option --parse-metadata

* The fields extracted by this can be used in `--output`
* Deprecated `--metadata-from-title`

:ci skip dl
2025-02-18 15:36:48 +01:00 · 2021-01-26 15:50:20 +05:30 · 2021-01-26 15:50:20 +05:30 · 5bfa486205
commit 5bfa486205
parent 9882064024
8 changed files with 162 additions and 110 deletions
--- a/README.md
+++ b/README.md
@ -610,16 +610,19 @@ Then simply type this
    --no-embed-thumbnail             Do not embed thumbnail (default)
    --add-metadata                   Write metadata to the video file
    --no-add-metadata                Do not write metadata (default)
-    --metadata-from-title FORMAT     Parse additional metadata like song title /
-                                     artist from the video title. The format
-                                     syntax is the same as --output. Regular
-                                     expression with named capture groups may
-                                     also be used. The parsed parameters replace
-                                     existing values. Example: --metadata-from-
-                                     title "%(artist)s - %(title)s" matches a
+    --parse-metadata FIELD:FORMAT    Parse additional metadata like title/artist
+                                     from other fields. Give field name to
+                                     extract data from, and format of the field
+                                     separated by a ":". The format syntax is
+                                     the same as --output. Regular expression
+                                     with named capture groups may also be used.
+                                     The parsed parameters replace existing
+                                     values. This option can be used multiple
+                                     times. Example: --parse-metadata
+                                     "title:%(artist)s - %(title)s" matches a
                                     title like "Coldplay - Paradise". Example
-                                     (regex): --metadata-from-title
-                                     "(?P<artist>.+?) - (?P<title>.+)"
+                                     (regex): --parse-metadata
+                                     "description:Artist - (?P<artist>.+?)"
    --xattrs                         Write metadata to the video file's xattrs
                                     (using dublin core and xdg standards)
    --fixup POLICY                   Automatically correct known faults of the
@ -1098,7 +1101,7 @@ $ youtube-dlc -S '+res:480,codec,br'

 Plugins are loaded from `<root-dir>/ytdlp_plugins/<type>/__init__.py`. Currently only `extractor` plugins are supported. Support for `downloader` and `postprocessor` plugins may be added in the future. See [ytdlp_plugins](ytdlp_plugins) for example.

-**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code ((`<root dir>/youtube_dlc/__main__.py`)
+**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code (`<root dir>/youtube_dlc/__main__.py`)

 # MORE
-For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl)
+For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl#faq)
--- a/test/test_postprocessors.py
+++ b/test/test_postprocessors.py
@ -8,10 +8,16 @@ import sys
 import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

-from youtube_dlc.postprocessor import MetadataFromTitlePP
+from youtube_dlc.postprocessor import MetadataFromFieldPP, MetadataFromTitlePP
+
+
+class TestMetadataFromField(unittest.TestCase):
+    def test_format_to_regex(self):
+        pp = MetadataFromFieldPP(None, ['title:%(title)s - %(artist)s'])
+        self.assertEqual(pp._data[0]['regex'], r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)')


 class TestMetadataFromTitle(unittest.TestCase):
    def test_format_to_regex(self):
        pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s')
-        self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)')
+        self.assertEqual(pp._titleregex, r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)')
--- a/youtube_dlc/YoutubeDL.py
+++ b/youtube_dlc/YoutubeDL.py
@ -375,8 +375,7 @@ class YoutubeDL(object):

    params = None
    _ies = []
-    _pps = []
-    _pps_end = []
+    _pps = {'beforedl': [], 'aftermove': [], 'normal': []}
    __prepare_filename_warned = False
    _download_retcode = None
    _num_downloads = None
@ -390,8 +389,7 @@ class YoutubeDL(object):
            params = {}
        self._ies = []
        self._ies_instances = {}
-        self._pps = []
-        self._pps_end = []
+        self._pps = {'beforedl': [], 'aftermove': [], 'normal': []}
        self.__prepare_filename_warned = False
        self._post_hooks = []
        self._progress_hooks = []
@ -494,11 +492,13 @@ class YoutubeDL(object):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
-            after_move = pp_def.get('_after_move', False)
-            if '_after_move' in pp_def:
-                del pp_def['_after_move']
+            if 'when' in pp_def:
+                when = pp_def['when']
+                del pp_def['when']
+            else:
+                when = 'normal'
            pp = pp_class(self, **compat_kwargs(pp_def))
-            self.add_post_processor(pp, after_move=after_move)
+            self.add_post_processor(pp, when=when)

        for ph in self.params.get('post_hooks', []):
            self.add_post_hook(ph)
@ -550,12 +550,9 @@ class YoutubeDL(object):
        for ie in gen_extractor_classes():
            self.add_info_extractor(ie)

-    def add_post_processor(self, pp, after_move=False):
+    def add_post_processor(self, pp, when='normal'):
        """Add a PostProcessor object to the end of the chain."""
-        if after_move:
-            self._pps_end.append(pp)
-        else:
-            self._pps.append(pp)
+        self._pps[when].append(pp)
        pp.set_downloader(self)

    def add_post_hook(self, ph):
@ -1948,6 +1945,8 @@ class YoutubeDL(object):

        self._num_downloads += 1

+        info_dict = self.pre_process(info_dict)
+
        filename = self.prepare_filename(info_dict, warn=True)
        info_dict['_filename'] = full_filename = self.prepare_filepath(filename)
        temp_filename = self.prepare_filepath(filename, 'temp')
@ -2400,20 +2399,14 @@ class YoutubeDL(object):
            (k, v) for k, v in info_dict.items()
            if k not in ['requested_formats', 'requested_subtitles'])

-    def post_process(self, filename, ie_info, files_to_move={}):
-        """Run all the postprocessors on the given file."""
-        info = dict(ie_info)
-        info['filepath'] = filename
-
-        def run_pp(pp):
+    def run_pp(self, pp, infodict, files_to_move={}):
        files_to_delete = []
-            infodict = info
        try:
            files_to_delete, infodict = pp.run(infodict)
        except PostProcessingError as e:
            self.report_error(e.msg)
        if not files_to_delete:
-                return infodict
+            return files_to_move, infodict

        if self.params.get('keepvideo', False):
            for f in files_to_delete:
@ -2427,14 +2420,24 @@ class YoutubeDL(object):
                    self.report_warning('Unable to remove downloaded original file')
                if old_filename in files_to_move:
                    del files_to_move[old_filename]
-            return infodict
+        return files_to_move, infodict

-        for pp in ie_info.get('__postprocessors', []) + self._pps:
-            info = run_pp(pp)
-        info = run_pp(MoveFilesAfterDownloadPP(self, files_to_move))
-        files_to_move = {}
-        for pp in self._pps_end:
-            info = run_pp(pp)
+    def pre_process(self, ie_info):
+        info = dict(ie_info)
+        for pp in self._pps['beforedl']:
+            info = self.run_pp(pp, info)[1]
+        return info
+
+    def post_process(self, filename, ie_info, files_to_move={}):
+        """Run all the postprocessors on the given file."""
+        info = dict(ie_info)
+        info['filepath'] = filename
+
+        for pp in ie_info.get('__postprocessors', []) + self._pps['normal']:
+            files_to_move, info = self.run_pp(pp, info, files_to_move)
+        info = self.run_pp(MoveFilesAfterDownloadPP(self, files_to_move), info, files_to_move)[1]
+        for pp in self._pps['aftermove']:
+            files_to_move, info = self.run_pp(pp, info, {})

    def _make_archive_id(self, info_dict):
        video_id = info_dict.get('id')
--- a/youtube_dlc/init.py
+++ b/youtube_dlc/init.py
@ -45,6 +45,7 @@ from .downloader import (
 from .extractor import gen_extractors, list_extractors
 from .extractor.common import InfoExtractor
 from .extractor.adobepass import MSO_INFO
+from .postprocessor.metadatafromfield import MetadataFromFieldPP
 from .YoutubeDL import YoutubeDL


@ -249,16 +250,25 @@ def _real_main(argv=None):
        if re.match(InfoExtractor.FormatSort.regex, f) is None:
            parser.error('invalid format sort string "%s" specified' % f)

+    if opts.metafromfield is None:
+        opts.metafromfield = []
+    if opts.metafromtitle is not None:
+        opts.metafromfield.append('title:%s' % opts.metafromtitle)
+    for f in opts.metafromfield:
+        if re.match(MetadataFromFieldPP.regex, f) is None:
+            parser.error('invalid format string "%s" specified for --parse-metadata' % f)
+
    any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
    any_printing = opts.print_json
    download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive

    # PostProcessors
    postprocessors = []
-    if opts.metafromtitle:
+    if opts.metafromfield:
        postprocessors.append({
-            'key': 'MetadataFromTitle',
-            'titleformat': opts.metafromtitle
+            'key': 'MetadataFromField',
+            'formats': opts.metafromfield,
+            'when': 'beforedl'
        })
    if opts.extractaudio:
        postprocessors.append({
@ -324,7 +334,7 @@ def _real_main(argv=None):
        postprocessors.append({
            'key': 'ExecAfterDownload',
            'exec_cmd': opts.exec_cmd,
-            '_after_move': True
+            'when': 'aftermove'
        })

    _args_compat_warning = 'WARNING: %s given without specifying name. The arguments will be given to all %s\n'
--- a/youtube_dlc/options.py
+++ b/youtube_dlc/options.py
@ -1078,14 +1078,20 @@ def parseOpts(overrideArguments=None):
    postproc.add_option(
        '--metadata-from-title',
        metavar='FORMAT', dest='metafromtitle',
+        help=optparse.SUPPRESS_HELP)
+    postproc.add_option(
+        '--parse-metadata',
+        metavar='FIELD:FORMAT', dest='metafromfield', action='append',
        help=(
-            'Parse additional metadata like song title / artist from the video title. '
-            'The format syntax is the same as --output. Regular expression with '
-            'named capture groups may also be used. '
+            'Parse additional metadata like title/artist from other fields. '
+            'Give field name to extract data from, and format of the field seperated by a ":". '
+            'The format syntax is the same as --output. '
+            'Regular expression with named capture groups may also be used. '
            'The parsed parameters replace existing values. '
-            'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
+            'This option can be used multiple times. '
+            'Example: --parse-metadata "title:%(artist)s - %(title)s" matches a title like '
            '"Coldplay - Paradise". '
-            'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"'))
+            'Example (regex): --parse-metadata "description:Artist - (?P<artist>.+?)"'))
    postproc.add_option(
        '--xattrs',
        action='store_true', dest='xattrs', default=False,
--- a/youtube_dlc/postprocessor/init.py
+++ b/youtube_dlc/postprocessor/init.py
@ -16,7 +16,8 @@ from .ffmpeg import (
 )
 from .xattrpp import XAttrMetadataPP
 from .execafterdownload import ExecAfterDownloadPP
-from .metadatafromtitle import MetadataFromTitlePP
+from .metadatafromfield import MetadataFromFieldPP
+from .metadatafromfield import MetadataFromTitlePP
 from .movefilesafterdownload import MoveFilesAfterDownloadPP
 from .sponskrub import SponSkrubPP

@ -39,6 +40,7 @@ __all__ = [
    'FFmpegSubtitlesConvertorPP',
    'FFmpegVideoConvertorPP',
    'FFmpegVideoRemuxerPP',
+    'MetadataFromFieldPP',
    'MetadataFromTitlePP',
    'MoveFilesAfterDownloadPP',
    'SponSkrubPP',
--- a/youtube_dlc/postprocessor/metadatafromfield.py
+++ b/youtube_dlc/postprocessor/metadatafromfield.py
@ -0,0 +1,66 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import PostProcessor
+from ..compat import compat_str
+
+
+class MetadataFromFieldPP(PostProcessor):
+    regex = r'(?P<field>\w+):(?P<format>.+)$'
+
+    def __init__(self, downloader, formats):
+        PostProcessor.__init__(self, downloader)
+        assert isinstance(formats, (list, tuple))
+        self._data = []
+        for f in formats:
+            assert isinstance(f, compat_str)
+            match = re.match(self.regex, f)
+            assert match is not None
+            self._data.append({
+                'field': match.group('field'),
+                'format': match.group('format'),
+                'regex': self.format_to_regex(match.group('format'))})
+
+    def format_to_regex(self, fmt):
+        r"""
+        Converts a string like
+           '%(title)s - %(artist)s'
+        to a regex like
+           '(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)'
+        """
+        if not re.search(r'%\(\w+\)s', fmt):
+            return fmt
+        lastpos = 0
+        regex = ''
+        # replace %(..)s with regex group and escape other string parts
+        for match in re.finditer(r'%\((\w+)\)s', fmt):
+            regex += re.escape(fmt[lastpos:match.start()])
+            regex += r'(?P<' + match.group(1) + r'>[^\r\n]+)'
+            lastpos = match.end()
+        if lastpos < len(fmt):
+            regex += re.escape(fmt[lastpos:])
+        return regex
+
+    def run(self, info):
+        for dictn in self._data:
+            field, regex = dictn['field'], dictn['regex']
+            if field not in info:
+                self.report_warning('Video doesnot have a %s' % field)
+                continue
+            self.write_debug('Searching for r"%s" in %s' % (regex, field))
+            match = re.search(regex, info[field])
+            if match is None:
+                self.report_warning('Could not interpret video %s as "%s"' % (field, dictn['format']))
+                continue
+            for attribute, value in match.groupdict().items():
+                info[attribute] = value
+                self.to_screen('parsed %s from %s: %s' % (attribute, field, value if value is not None else 'NA'))
+        return [], info
+
+
+class MetadataFromTitlePP(MetadataFromFieldPP):  # for backward compatibility
+    def __init__(self, downloader, titleformat):
+        super(MetadataFromTitlePP, self).__init__(downloader, ['title:%s' % titleformat])
+        self._titleformat = titleformat
+        self._titleregex = self._data[0]['regex']
--- a/youtube_dlc/postprocessor/metadatafromtitle.py
+++ b/youtube_dlc/postprocessor/metadatafromtitle.py
@ -1,44 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import PostProcessor
-
-
-class MetadataFromTitlePP(PostProcessor):
-    def __init__(self, downloader, titleformat):
-        super(MetadataFromTitlePP, self).__init__(downloader)
-        self._titleformat = titleformat
-        self._titleregex = (self.format_to_regex(titleformat)
-                            if re.search(r'%\(\w+\)s', titleformat)
-                            else titleformat)
-
-    def format_to_regex(self, fmt):
-        r"""
-        Converts a string like
-           '%(title)s - %(artist)s'
-        to a regex like
-           '(?P<title>.+)\ \-\ (?P<artist>.+)'
-        """
-        lastpos = 0
-        regex = ''
-        # replace %(..)s with regex group and escape other string parts
-        for match in re.finditer(r'%\((\w+)\)s', fmt):
-            regex += re.escape(fmt[lastpos:match.start()])
-            regex += r'(?P<' + match.group(1) + '>.+)'
-            lastpos = match.end()
-        if lastpos < len(fmt):
-            regex += re.escape(fmt[lastpos:])
-        return regex
-
-    def run(self, info):
-        title = info['title']
-        match = re.match(self._titleregex, title)
-        if match is None:
-            self.to_screen('Could not interpret title of video as "%s"' % self._titleformat)
-            return [], info
-        for attribute, value in match.groupdict().items():
-            info[attribute] = value
-            self.to_screen('parsed %s: %s' % (attribute, value if value is not None else 'NA'))
-
-        return [], info