Parse metadata from multiple fields

Closes #196
This commit is contained in:
pukkandan 2021-03-25 03:32:15 +05:30
parent 3700c7ef10
commit 143db31d48
No known key found for this signature in database
GPG Key ID: 0F00D95A001F4698
5 changed files with 143 additions and 115 deletions

View File

@ -670,18 +670,24 @@ ## Post-Processing Options:
--add-metadata Write metadata to the video file
--no-add-metadata Do not write metadata (default)
--parse-metadata FIELD:FORMAT Parse additional metadata like title/artist
from other fields. Give field name to
extract data from, and format of the field
seperated by a ":". Either regular
expression with named capture groups or a
similar syntax to the output template can
also be used. The parsed parameters replace
any existing values and can be use in
output template. This option can be used
multiple times. Example: --parse-metadata
"title:%(artist)s - %(title)s" matches a
title like "Coldplay - Paradise". Example
(regex): --parse-metadata
from other fields. Give a template or field
name to extract data from and the format to
interpret it as, seperated by a ":". Either
regular expression with named capture
groups or a similar syntax to the output
template can be used for the FORMAT.
Similarly, the syntax for output template
can be used for FIELD to parse the data
from multiple fields. The parsed parameters
replace any existing values and can be used
in output templates. This option can be
used multiple times. Example: --parse-
metadata "title:%(artist)s - %(title)s"
matches a title like "Coldplay - Paradise".
Example: --parse-metadata "%(series)s
%(episode_number)s:%(title)s" sets the
title using series and episode number.
Example (regex): --parse-metadata
"description:Artist - (?P<artist>.+?)"
--xattrs Write metadata to the video file's xattrs
(using dublin core and xdg standards)

View File

@ -67,6 +67,7 @@
float_or_none,
format_bytes,
format_field,
FORMAT_RE,
formatSeconds,
GeoRestrictedError,
int_or_none,
@ -772,95 +773,93 @@ def parse_outtmpl(self):
'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')
return outtmpl_dict
def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
""" Make the template and info_dict suitable for substitution (outtmpl % info_dict)"""
template_dict = dict(info_dict)
# duration_string
template_dict['duration_string'] = ( # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
formatSeconds(info_dict['duration'], '-')
if info_dict.get('duration', None) is not None
else None)
# epoch
template_dict['epoch'] = int(time.time())
# autonumber
autonumber_size = self.params.get('autonumber_size')
if autonumber_size is None:
autonumber_size = 5
template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
# resolution if not defined
if template_dict.get('resolution') is None:
if template_dict.get('width') and template_dict.get('height'):
template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
elif template_dict.get('height'):
template_dict['resolution'] = '%sp' % template_dict['height']
elif template_dict.get('width'):
template_dict['resolution'] = '%dx?' % template_dict['width']
if sanitize is None:
sanitize = lambda k, v: v
template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
for k, v in template_dict.items()
if v is not None and not isinstance(v, (list, tuple, dict)))
na = self.params.get('outtmpl_na_placeholder', 'NA')
template_dict = collections.defaultdict(lambda: na, template_dict)
# For fields playlist_index and autonumber convert all occurrences
# of %(field)s to %(field)0Nd for backward compatibility
field_size_compat_map = {
'playlist_index': len(str(template_dict['n_entries'])),
'autonumber': autonumber_size,
}
FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
if mobj:
outtmpl = re.sub(
FIELD_SIZE_COMPAT_RE,
r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
outtmpl)
numeric_fields = list(self._NUMERIC_FIELDS)
# Format date
FORMAT_DATE_RE = FORMAT_RE.format(r'(?P<key>(?P<field>\w+)>(?P<format>.+?))')
for mobj in re.finditer(FORMAT_DATE_RE, outtmpl):
conv_type, field, frmt, key = mobj.group('type', 'field', 'format', 'key')
if key in template_dict:
continue
value = strftime_or_none(template_dict.get(field), frmt, na)
if conv_type in 'crs': # string
value = sanitize(field, value)
else: # number
numeric_fields.append(key)
value = float_or_none(value, default=None)
if value is not None:
template_dict[key] = value
# Missing numeric fields used together with integer presentation types
# in format specification will break the argument substitution since
# string NA placeholder is returned for missing fields. We will patch
# output template for missing fields to meet string presentation type.
for numeric_field in numeric_fields:
if numeric_field not in template_dict:
outtmpl = re.sub(
FORMAT_RE.format(re.escape(numeric_field)),
r'%({0})s'.format(numeric_field), outtmpl)
return outtmpl, template_dict
def _prepare_filename(self, info_dict, tmpl_type='default'):
try:
template_dict = dict(info_dict)
template_dict['duration_string'] = ( # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
formatSeconds(info_dict['duration'], '-')
if info_dict.get('duration', None) is not None
else None)
template_dict['epoch'] = int(time.time())
autonumber_size = self.params.get('autonumber_size')
if autonumber_size is None:
autonumber_size = 5
template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
if template_dict.get('resolution') is None:
if template_dict.get('width') and template_dict.get('height'):
template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
elif template_dict.get('height'):
template_dict['resolution'] = '%sp' % template_dict['height']
elif template_dict.get('width'):
template_dict['resolution'] = '%dx?' % template_dict['width']
sanitize = lambda k, v: sanitize_filename(
compat_str(v),
restricted=self.params.get('restrictfilenames'),
is_id=(k == 'id' or k.endswith('_id')))
template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
for k, v in template_dict.items()
if v is not None and not isinstance(v, (list, tuple, dict)))
na = self.params.get('outtmpl_na_placeholder', 'NA')
template_dict = collections.defaultdict(lambda: na, template_dict)
outtmpl = self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])
force_ext = OUTTMPL_TYPES.get(tmpl_type)
# For fields playlist_index and autonumber convert all occurrences
# of %(field)s to %(field)0Nd for backward compatibility
field_size_compat_map = {
'playlist_index': len(str(template_dict['n_entries'])),
'autonumber': autonumber_size,
}
FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
if mobj:
outtmpl = re.sub(
FIELD_SIZE_COMPAT_RE,
r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
outtmpl)
# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
FORMAT_RE = r'''(?x)
(?<!%)
%
\({0}\) # mapping key
(?:[#0\-+ ]+)? # conversion flags (optional)
(?:\d+)? # minimum field width (optional)
(?:\.\d+)? # precision (optional)
[hlL]? # length modifier (optional)
(?P<type>[diouxXeEfFgGcrs%]) # conversion type
'''
numeric_fields = list(self._NUMERIC_FIELDS)
# Format date
FORMAT_DATE_RE = FORMAT_RE.format(r'(?P<key>(?P<field>\w+)>(?P<format>.+?))')
for mobj in re.finditer(FORMAT_DATE_RE, outtmpl):
conv_type, field, frmt, key = mobj.group('type', 'field', 'format', 'key')
if key in template_dict:
continue
value = strftime_or_none(template_dict.get(field), frmt, na)
if conv_type in 'crs': # string
value = sanitize(field, value)
else: # number
numeric_fields.append(key)
value = float_or_none(value, default=None)
if value is not None:
template_dict[key] = value
# Missing numeric fields used together with integer presentation types
# in format specification will break the argument substitution since
# string NA placeholder is returned for missing fields. We will patch
# output template for missing fields to meet string presentation type.
for numeric_field in numeric_fields:
if numeric_field not in template_dict:
outtmpl = re.sub(
FORMAT_RE.format(re.escape(numeric_field)),
r'%({0})s'.format(numeric_field), outtmpl)
outtmpl, template_dict = self.prepare_outtmpl(outtmpl, info_dict, sanitize)
# expand_path translates '%%' into '%' and '$$' into '$'
# correspondingly that is not what we want since we need to keep
@ -875,6 +874,7 @@ def _prepare_filename(self, info_dict, tmpl_type='default'):
# title "Hello $PATH", we don't want `$PATH` to be expanded.
filename = expand_path(outtmpl).replace(sep, '') % template_dict
force_ext = OUTTMPL_TYPES.get(tmpl_type)
if force_ext is not None:
filename = replace_extension(filename, force_ext, template_dict.get('ext'))

View File

@ -1147,13 +1147,18 @@ def _dict_from_multiple_values_options_callback(
metavar='FIELD:FORMAT', dest='metafromfield', action='append',
help=(
'Parse additional metadata like title/artist from other fields. '
'Give field name to extract data from, and format of the field seperated by a ":". '
'Give a template or field name to extract data from and the '
'format to interpret it as, seperated by a ":". '
'Either regular expression with named capture groups or a '
'similar syntax to the output template can also be used. '
'The parsed parameters replace any existing values and can be use in output template. '
'similar syntax to the output template can be used for the FORMAT. '
'Similarly, the syntax for output template can be used for FIELD '
'to parse the data from multiple fields. '
'The parsed parameters replace any existing values and can be used in output templates. '
'This option can be used multiple times. '
'Example: --parse-metadata "title:%(artist)s - %(title)s" matches a title like '
'"Coldplay - Paradise". '
'Example: --parse-metadata "%(series)s %(episode_number)s:%(title)s" '
'sets the title using series and episode number. '
'Example (regex): --parse-metadata "description:Artist - (?P<artist>.+?)"'))
postproc.add_option(
'--xattrs',

View File

@ -8,7 +8,7 @@
class MetadataFromFieldPP(PostProcessor):
regex = r'(?P<field>\w+):(?P<format>.+)$'
regex = r'(?P<in>.+):(?P<out>.+)$'
def __init__(self, downloader, formats):
PostProcessor.__init__(self, downloader)
@ -19,11 +19,20 @@ def __init__(self, downloader, formats):
match = re.match(self.regex, f)
assert match is not None
self._data.append({
'field': match.group('field'),
'format': match.group('format'),
'regex': self.format_to_regex(match.group('format'))})
'in': match.group('in'),
'out': match.group('out'),
'tmpl': self.field_to_template(match.group('in')),
'regex': self.format_to_regex(match.group('out')),
})
def format_to_regex(self, fmt):
@staticmethod
def field_to_template(tmpl):
if re.match(r'\w+$', tmpl):
return '%%(%s)s' % tmpl
return tmpl
@staticmethod
def format_to_regex(fmt):
r"""
Converts a string like
'%(title)s - %(artist)s'
@ -37,7 +46,7 @@ def format_to_regex(self, fmt):
# replace %(..)s with regex group and escape other string parts
for match in re.finditer(r'%\((\w+)\)s', fmt):
regex += re.escape(fmt[lastpos:match.start()])
regex += r'(?P<' + match.group(1) + r'>[^\r\n]+)'
regex += r'(?P<%s>[^\r\n]+)' % match.group(1)
lastpos = match.end()
if lastpos < len(fmt):
regex += re.escape(fmt[lastpos:])
@ -45,22 +54,16 @@ def format_to_regex(self, fmt):
def run(self, info):
for dictn in self._data:
field, regex = dictn['field'], dictn['regex']
if field not in info:
self.report_warning('Video doesnot have a %s' % field)
continue
data_to_parse = str_or_none(info[field])
if data_to_parse is None:
self.report_warning('Field %s cannot be parsed' % field)
continue
self.write_debug('Searching for r"%s" in %s' % (regex, field))
match = re.search(regex, data_to_parse)
tmpl, info_copy = self._downloader.prepare_outtmpl(dictn['tmpl'], info)
data_to_parse = tmpl % info_copy
self.write_debug('Searching for r"%s" in %s' % (dictn['regex'], tmpl))
match = re.search(dictn['regex'], data_to_parse)
if match is None:
self.report_warning('Could not interpret video %s as "%s"' % (field, dictn['format']))
self.report_warning('Could not interpret video %s as "%s"' % (dictn['in'], dictn['out']))
continue
for attribute, value in match.groupdict().items():
info[attribute] = value
self.to_screen('parsed %s from %s: %s' % (attribute, field, value if value is not None else 'NA'))
self.to_screen('parsed %s from "%s": %s' % (attribute, dictn['in'], value if value is not None else 'NA'))
return [], info

View File

@ -4205,6 +4205,20 @@ def q(qid):
'pl_infojson': 'info.json',
}
# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
FORMAT_RE = r'''(?x)
(?<!%)
%
\({0}\) # mapping key
(?:[#0\-+ ]+)? # conversion flags (optional)
(?:\d+)? # minimum field width (optional)
(?:\.\d+)? # precision (optional)
[hlL]? # length modifier (optional)
(?P<type>[diouxXeEfFgGcrs%]) # conversion type
'''
def limit_length(s, length):
""" Add ellipses to overly long strings """