Add option --parse-metadata

* The fields extracted by this can be used in `--output`
* Deprecated `--metadata-from-title`

:ci skip dl
This commit is contained in:
pukkandan 2021-01-26 15:50:20 +05:30
parent 9882064024
commit 5bfa486205
8 changed files with 162 additions and 110 deletions

View File

@ -610,16 +610,19 @@ ## Post-Processing Options:
--no-embed-thumbnail Do not embed thumbnail (default)
--add-metadata Write metadata to the video file
--no-add-metadata Do not write metadata (default)
--metadata-from-title FORMAT Parse additional metadata like song title /
artist from the video title. The format
syntax is the same as --output. Regular
expression with named capture groups may
also be used. The parsed parameters replace
existing values. Example: --metadata-from-
title "%(artist)s - %(title)s" matches a
--parse-metadata FIELD:FORMAT Parse additional metadata like title/artist
from other fields. Give field name to
extract data from, and format of the field
seperated by a ":". The format syntax is
the same as --output. Regular expression
with named capture groups may also be used.
The parsed parameters replace existing
values. This option can be used multiple
times. Example: --parse-metadata
"title:%(artist)s - %(title)s" matches a
title like "Coldplay - Paradise". Example
(regex): --metadata-from-title
"(?P<artist>.+?) - (?P<title>.+)"
(regex): --parse-metadata
"description:Artist - (?P<artist>.+?)"
--xattrs Write metadata to the video file's xattrs
(using dublin core and xdg standards)
--fixup POLICY Automatically correct known faults of the
@ -1098,7 +1101,7 @@ # PLUGINS
Plugins are loaded from `<root-dir>/ytdlp_plugins/<type>/__init__.py`. Currently only `extractor` plugins are supported. Support for `downloader` and `postprocessor` plugins may be added in the future. See [ytdlp_plugins](ytdlp_plugins) for example.
**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code ((`<root dir>/youtube_dlc/__main__.py`)
**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code (`<root dir>/youtube_dlc/__main__.py`)
# MORE
For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl)
For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl#faq)

View File

@ -8,10 +8,16 @@
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dlc.postprocessor import MetadataFromTitlePP
from youtube_dlc.postprocessor import MetadataFromFieldPP, MetadataFromTitlePP
class TestMetadataFromField(unittest.TestCase):
def test_format_to_regex(self):
pp = MetadataFromFieldPP(None, ['title:%(title)s - %(artist)s'])
self.assertEqual(pp._data[0]['regex'], r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)')
class TestMetadataFromTitle(unittest.TestCase):
def test_format_to_regex(self):
pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s')
self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)')
self.assertEqual(pp._titleregex, r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)')

View File

@ -375,8 +375,7 @@ class YoutubeDL(object):
params = None
_ies = []
_pps = []
_pps_end = []
_pps = {'beforedl': [], 'aftermove': [], 'normal': []}
__prepare_filename_warned = False
_download_retcode = None
_num_downloads = None
@ -390,8 +389,7 @@ def __init__(self, params=None, auto_init=True):
params = {}
self._ies = []
self._ies_instances = {}
self._pps = []
self._pps_end = []
self._pps = {'beforedl': [], 'aftermove': [], 'normal': []}
self.__prepare_filename_warned = False
self._post_hooks = []
self._progress_hooks = []
@ -494,11 +492,13 @@ def check_deprecated(param, option, suggestion):
pp_class = get_postprocessor(pp_def_raw['key'])
pp_def = dict(pp_def_raw)
del pp_def['key']
after_move = pp_def.get('_after_move', False)
if '_after_move' in pp_def:
del pp_def['_after_move']
if 'when' in pp_def:
when = pp_def['when']
del pp_def['when']
else:
when = 'normal'
pp = pp_class(self, **compat_kwargs(pp_def))
self.add_post_processor(pp, after_move=after_move)
self.add_post_processor(pp, when=when)
for ph in self.params.get('post_hooks', []):
self.add_post_hook(ph)
@ -550,12 +550,9 @@ def add_default_info_extractors(self):
for ie in gen_extractor_classes():
self.add_info_extractor(ie)
def add_post_processor(self, pp, after_move=False):
def add_post_processor(self, pp, when='normal'):
"""Add a PostProcessor object to the end of the chain."""
if after_move:
self._pps_end.append(pp)
else:
self._pps.append(pp)
self._pps[when].append(pp)
pp.set_downloader(self)
def add_post_hook(self, ph):
@ -1948,6 +1945,8 @@ def process_info(self, info_dict):
self._num_downloads += 1
info_dict = self.pre_process(info_dict)
filename = self.prepare_filename(info_dict, warn=True)
info_dict['_filename'] = full_filename = self.prepare_filepath(filename)
temp_filename = self.prepare_filepath(filename, 'temp')
@ -2400,41 +2399,45 @@ def filter_requested_info(info_dict):
(k, v) for k, v in info_dict.items()
if k not in ['requested_formats', 'requested_subtitles'])
def run_pp(self, pp, infodict, files_to_move={}):
files_to_delete = []
try:
files_to_delete, infodict = pp.run(infodict)
except PostProcessingError as e:
self.report_error(e.msg)
if not files_to_delete:
return files_to_move, infodict
if self.params.get('keepvideo', False):
for f in files_to_delete:
files_to_move.setdefault(f, '')
else:
for old_filename in set(files_to_delete):
self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
try:
os.remove(encodeFilename(old_filename))
except (IOError, OSError):
self.report_warning('Unable to remove downloaded original file')
if old_filename in files_to_move:
del files_to_move[old_filename]
return files_to_move, infodict
def pre_process(self, ie_info):
info = dict(ie_info)
for pp in self._pps['beforedl']:
info = self.run_pp(pp, info)[1]
return info
def post_process(self, filename, ie_info, files_to_move={}):
"""Run all the postprocessors on the given file."""
info = dict(ie_info)
info['filepath'] = filename
def run_pp(pp):
files_to_delete = []
infodict = info
try:
files_to_delete, infodict = pp.run(infodict)
except PostProcessingError as e:
self.report_error(e.msg)
if not files_to_delete:
return infodict
if self.params.get('keepvideo', False):
for f in files_to_delete:
files_to_move.setdefault(f, '')
else:
for old_filename in set(files_to_delete):
self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
try:
os.remove(encodeFilename(old_filename))
except (IOError, OSError):
self.report_warning('Unable to remove downloaded original file')
if old_filename in files_to_move:
del files_to_move[old_filename]
return infodict
for pp in ie_info.get('__postprocessors', []) + self._pps:
info = run_pp(pp)
info = run_pp(MoveFilesAfterDownloadPP(self, files_to_move))
files_to_move = {}
for pp in self._pps_end:
info = run_pp(pp)
for pp in ie_info.get('__postprocessors', []) + self._pps['normal']:
files_to_move, info = self.run_pp(pp, info, files_to_move)
info = self.run_pp(MoveFilesAfterDownloadPP(self, files_to_move), info, files_to_move)[1]
for pp in self._pps['aftermove']:
files_to_move, info = self.run_pp(pp, info, {})
def _make_archive_id(self, info_dict):
video_id = info_dict.get('id')

View File

@ -45,6 +45,7 @@
from .extractor import gen_extractors, list_extractors
from .extractor.common import InfoExtractor
from .extractor.adobepass import MSO_INFO
from .postprocessor.metadatafromfield import MetadataFromFieldPP
from .YoutubeDL import YoutubeDL
@ -249,16 +250,25 @@ def parse_retries(retries):
if re.match(InfoExtractor.FormatSort.regex, f) is None:
parser.error('invalid format sort string "%s" specified' % f)
if opts.metafromfield is None:
opts.metafromfield = []
if opts.metafromtitle is not None:
opts.metafromfield.append('title:%s' % opts.metafromtitle)
for f in opts.metafromfield:
if re.match(MetadataFromFieldPP.regex, f) is None:
parser.error('invalid format string "%s" specified for --parse-metadata' % f)
any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
any_printing = opts.print_json
download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive
# PostProcessors
postprocessors = []
if opts.metafromtitle:
if opts.metafromfield:
postprocessors.append({
'key': 'MetadataFromTitle',
'titleformat': opts.metafromtitle
'key': 'MetadataFromField',
'formats': opts.metafromfield,
'when': 'beforedl'
})
if opts.extractaudio:
postprocessors.append({
@ -324,7 +334,7 @@ def parse_retries(retries):
postprocessors.append({
'key': 'ExecAfterDownload',
'exec_cmd': opts.exec_cmd,
'_after_move': True
'when': 'aftermove'
})
_args_compat_warning = 'WARNING: %s given without specifying name. The arguments will be given to all %s\n'

View File

@ -1078,14 +1078,20 @@ def _dict_from_multiple_values_options_callback(
postproc.add_option(
'--metadata-from-title',
metavar='FORMAT', dest='metafromtitle',
help=optparse.SUPPRESS_HELP)
postproc.add_option(
'--parse-metadata',
metavar='FIELD:FORMAT', dest='metafromfield', action='append',
help=(
'Parse additional metadata like song title / artist from the video title. '
'The format syntax is the same as --output. Regular expression with '
'named capture groups may also be used. '
'Parse additional metadata like title/artist from other fields. '
'Give field name to extract data from, and format of the field seperated by a ":". '
'The format syntax is the same as --output. '
'Regular expression with named capture groups may also be used. '
'The parsed parameters replace existing values. '
'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
'This option can be used multiple times. '
'Example: --parse-metadata "title:%(artist)s - %(title)s" matches a title like '
'"Coldplay - Paradise". '
'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"'))
'Example (regex): --parse-metadata "description:Artist - (?P<artist>.+?)"'))
postproc.add_option(
'--xattrs',
action='store_true', dest='xattrs', default=False,

View File

@ -16,7 +16,8 @@
)
from .xattrpp import XAttrMetadataPP
from .execafterdownload import ExecAfterDownloadPP
from .metadatafromtitle import MetadataFromTitlePP
from .metadatafromfield import MetadataFromFieldPP
from .metadatafromfield import MetadataFromTitlePP
from .movefilesafterdownload import MoveFilesAfterDownloadPP
from .sponskrub import SponSkrubPP
@ -39,6 +40,7 @@ def get_postprocessor(key):
'FFmpegSubtitlesConvertorPP',
'FFmpegVideoConvertorPP',
'FFmpegVideoRemuxerPP',
'MetadataFromFieldPP',
'MetadataFromTitlePP',
'MoveFilesAfterDownloadPP',
'SponSkrubPP',

View File

@ -0,0 +1,66 @@
from __future__ import unicode_literals
import re
from .common import PostProcessor
from ..compat import compat_str
class MetadataFromFieldPP(PostProcessor):
regex = r'(?P<field>\w+):(?P<format>.+)$'
def __init__(self, downloader, formats):
PostProcessor.__init__(self, downloader)
assert isinstance(formats, (list, tuple))
self._data = []
for f in formats:
assert isinstance(f, compat_str)
match = re.match(self.regex, f)
assert match is not None
self._data.append({
'field': match.group('field'),
'format': match.group('format'),
'regex': self.format_to_regex(match.group('format'))})
def format_to_regex(self, fmt):
r"""
Converts a string like
'%(title)s - %(artist)s'
to a regex like
'(?P<title>.+)\ \-\ (?P<artist>.+)'
"""
if not re.search(r'%\(\w+\)s', fmt):
return fmt
lastpos = 0
regex = ''
# replace %(..)s with regex group and escape other string parts
for match in re.finditer(r'%\((\w+)\)s', fmt):
regex += re.escape(fmt[lastpos:match.start()])
regex += r'(?P<' + match.group(1) + r'>[^\r\n]+)'
lastpos = match.end()
if lastpos < len(fmt):
regex += re.escape(fmt[lastpos:])
return regex
def run(self, info):
for dictn in self._data:
field, regex = dictn['field'], dictn['regex']
if field not in info:
self.report_warning('Video doesnot have a %s' % field)
continue
self.write_debug('Searching for r"%s" in %s' % (regex, field))
match = re.search(regex, info[field])
if match is None:
self.report_warning('Could not interpret video %s as "%s"' % (field, dictn['format']))
continue
for attribute, value in match.groupdict().items():
info[attribute] = value
self.to_screen('parsed %s from %s: %s' % (attribute, field, value if value is not None else 'NA'))
return [], info
class MetadataFromTitlePP(MetadataFromFieldPP): # for backward compatibility
def __init__(self, downloader, titleformat):
super(MetadataFromTitlePP, self).__init__(downloader, ['title:%s' % titleformat])
self._titleformat = titleformat
self._titleregex = self._data[0]['regex']

View File

@ -1,44 +0,0 @@
from __future__ import unicode_literals
import re
from .common import PostProcessor
class MetadataFromTitlePP(PostProcessor):
def __init__(self, downloader, titleformat):
super(MetadataFromTitlePP, self).__init__(downloader)
self._titleformat = titleformat
self._titleregex = (self.format_to_regex(titleformat)
if re.search(r'%\(\w+\)s', titleformat)
else titleformat)
def format_to_regex(self, fmt):
r"""
Converts a string like
'%(title)s - %(artist)s'
to a regex like
'(?P<title>.+)\ \-\ (?P<artist>.+)'
"""
lastpos = 0
regex = ''
# replace %(..)s with regex group and escape other string parts
for match in re.finditer(r'%\((\w+)\)s', fmt):
regex += re.escape(fmt[lastpos:match.start()])
regex += r'(?P<' + match.group(1) + '>.+)'
lastpos = match.end()
if lastpos < len(fmt):
regex += re.escape(fmt[lastpos:])
return regex
def run(self, info):
title = info['title']
match = re.match(self._titleregex, title)
if match is None:
self.to_screen('Could not interpret title of video as "%s"' % self._titleformat)
return [], info
for attribute, value in match.groupdict().items():
info[attribute] = value
self.to_screen('parsed %s: %s' % (attribute, value if value is not None else 'NA'))
return [], info