Extract comments only when needed #95 (Closes #94)

This commit is contained in:
pukkandan 2021-02-28 20:26:08 +05:30 committed by GitHub
parent 1cf376f55a
commit 277d6ff5f2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 57 additions and 17 deletions

View File

@ -245,7 +245,7 @@ ## Video Selection:
"OUTPUT TEMPLATE" for a list of available "OUTPUT TEMPLATE" for a list of available
keys) to match if the key is present, !key keys) to match if the key is present, !key
to check if the key is not present, to check if the key is not present,
key>NUMBER (like "comment_count > 12", also key>NUMBER (like "view_count > 12", also
works with >=, <, <=, !=, =) to compare works with >=, <, <=, !=, =) to compare
against a number, key = 'LITERAL' (like against a number, key = 'LITERAL' (like
"uploader = 'Mike Smith'", also works with "uploader = 'Mike Smith'", also works with
@ -403,7 +403,9 @@ ## Filesystem Options:
--no-write-playlist-metafiles Do not write playlist metadata when using --no-write-playlist-metafiles Do not write playlist metadata when using
--write-info-json, --write-description etc. --write-info-json, --write-description etc.
--get-comments Retrieve video comments to be placed in the --get-comments Retrieve video comments to be placed in the
.info.json file .info.json file. The comments are fetched
even without this option if the extraction
is known to be quick
--load-info-json FILE JSON file containing the video information --load-info-json FILE JSON file containing the video information
(created with the "--write-info-json" (created with the "--write-info-json"
option) option)
@ -814,7 +816,7 @@ # OUTPUT TEMPLATE
- `dislike_count` (numeric): Number of negative ratings of the video - `dislike_count` (numeric): Number of negative ratings of the video
- `repost_count` (numeric): Number of reposts of the video - `repost_count` (numeric): Number of reposts of the video
- `average_rating` (numeric): Average rating given by users, the scale used depends on the webpage - `average_rating` (numeric): Average rating given by users, the scale used depends on the webpage
- `comment_count` (numeric): Number of comments on the video - `comment_count` (numeric): Number of comments on the video (For some extractors, comments are only downloaded at the end, and so this field cannot be used)
- `age_limit` (numeric): Age restriction for the video (years) - `age_limit` (numeric): Age restriction for the video (years)
- `is_live` (boolean): Whether this video is a live stream or a fixed-length video - `is_live` (boolean): Whether this video is a live stream or a fixed-length video
- `was_live` (boolean): Whether this video was originally a live stream - `was_live` (boolean): Whether this video was originally a live stream

View File

@ -2041,6 +2041,7 @@ def print_optional(field):
self.to_stdout(formatSeconds(info_dict['duration'])) self.to_stdout(formatSeconds(info_dict['duration']))
print_mandatory('format') print_mandatory('format')
if self.params.get('forcejson', False): if self.params.get('forcejson', False):
self.post_extract(info_dict)
self.to_stdout(json.dumps(info_dict)) self.to_stdout(json.dumps(info_dict))
def process_info(self, info_dict): def process_info(self, info_dict):
@ -2064,6 +2065,7 @@ def process_info(self, info_dict):
if self._match_entry(info_dict, incomplete=False) is not None: if self._match_entry(info_dict, incomplete=False) is not None:
return return
self.post_extract(info_dict)
self._num_downloads += 1 self._num_downloads += 1
info_dict = self.pre_process(info_dict) info_dict = self.pre_process(info_dict)
@ -2497,6 +2499,7 @@ def download(self, url_list):
raise raise
else: else:
if self.params.get('dump_single_json', False): if self.params.get('dump_single_json', False):
self.post_extract(res)
self.to_stdout(json.dumps(res)) self.to_stdout(json.dumps(res))
return self._download_retcode return self._download_retcode
@ -2545,6 +2548,24 @@ def run_pp(self, pp, infodict, files_to_move={}):
del files_to_move[old_filename] del files_to_move[old_filename]
return files_to_move, infodict return files_to_move, infodict
@staticmethod
def post_extract(info_dict):
    """Run deferred ``__post_extractor`` callbacks and merge their results.

    An extractor may store a zero-argument callable under the
    ``'__post_extractor'`` key of an info dict. That callable returns a
    dict of extra fields (e.g. ``comments``/``comment_count``). This
    method calls it, merges the returned fields into ``info_dict`` in
    place and removes the ``'__post_extractor'`` key.

    For ``playlist`` and ``multi_video`` results the ``entries`` are
    processed recursively; the container dict itself is not touched.

    ``info_dict`` may be ``None`` (or contain ``None`` entries), in which
    case there is nothing to do and the value is silently skipped.

    Returns None; ``info_dict`` is modified in place.
    """
    def actual_post_extract(info_dict):
        if not info_dict:
            # A None/empty result carries no deferred work
            return

        if info_dict.get('_type') in ('playlist', 'multi_video'):
            entries = info_dict.get('entries')
            if entries is None:
                # 'entries' may be missing or explicitly None
                entries = ()
            for video_dict in entries:
                actual_post_extract(video_dict)
            return

        post_extractor = info_dict.get('__post_extractor')
        if post_extractor:
            info_dict.update(post_extractor())
        # Always drop the key (even when its value is falsy) so that the
        # callable never reaches code that serializes the info dict
        info_dict.pop('__post_extractor', None)

    actual_post_extract(info_dict)
def pre_process(self, ie_info): def pre_process(self, ie_info):
info = dict(ie_info) info = dict(ie_info)
for pp in self._pps['beforedl']: for pp in self._pps['beforedl']:

View File

@ -255,10 +255,6 @@ def _real_extract(self, url):
info['uploader'] = self._html_search_meta( info['uploader'] = self._html_search_meta(
'author', webpage, 'uploader', default=None) 'author', webpage, 'uploader', default=None)
comments = None
if self._downloader.params.get('getcomments', False):
comments = self._get_all_comment_pages(video_id)
raw_danmaku = self._get_raw_danmaku(video_id, cid) raw_danmaku = self._get_raw_danmaku(video_id, cid)
raw_tags = self._get_tags(video_id) raw_tags = self._get_tags(video_id)
@ -266,11 +262,18 @@ def _real_extract(self, url):
top_level_info = { top_level_info = {
'raw_danmaku': raw_danmaku, 'raw_danmaku': raw_danmaku,
'comments': comments,
'comment_count': len(comments) if comments is not None else None,
'tags': tags, 'tags': tags,
'raw_tags': raw_tags, 'raw_tags': raw_tags,
} }
if self._downloader.params.get('getcomments', False):
def get_comments():
comments = self._get_all_comment_pages(video_id)
return {
'comments': comments,
'comment_count': len(comments)
}
top_level_info['__post_extractor'] = get_comments
''' '''
# Requires https://github.com/m13253/danmaku2ass which is licenced under GPL3 # Requires https://github.com/m13253/danmaku2ass which is licenced under GPL3

View File

@ -294,6 +294,14 @@ class InfoExtractor(object):
players on other sites. Can be True (=always allowed), players on other sites. Can be True (=always allowed),
False (=never allowed), None (=unknown), or a string False (=never allowed), None (=unknown), or a string
specifying the criteria for embedability (Eg: 'whitelist'). specifying the criteria for embedability (Eg: 'whitelist').
__post_extractor: A function to be called just before the metadata is
written to either disk, logger or console. The function
must return a dict which will be added to the info_dict.
This is useful for additional information that is
time-consuming to extract. Note that the fields thus
extracted will not be available to output template and
match_filter. So, only "comments" and "comment_count" are
currently allowed to be extracted via this method.
The following fields should only be used when the video belongs to some logical The following fields should only be used when the video belongs to some logical
chapter or section: chapter or section:

View File

@ -2012,9 +2012,10 @@ def chapter_time(mmlir):
# Get comments # Get comments
# TODO: Refactor and move to separate function # TODO: Refactor and move to separate function
if get_comments: def extract_comments():
expected_video_comment_count = 0 expected_video_comment_count = 0
video_comments = [] video_comments = []
comment_xsrf = xsrf_token
def find_value(html, key, num_chars=2, separator='"'): def find_value(html, key, num_chars=2, separator='"'):
pos_begin = html.find(key) + len(key) + num_chars pos_begin = html.find(key) + len(key) + num_chars
@ -2083,7 +2084,7 @@ def get_continuation(continuation, session_token, replies=False):
self.to_screen('Downloading comments') self.to_screen('Downloading comments')
while continuations: while continuations:
continuation = continuations.pop() continuation = continuations.pop()
comment_response = get_continuation(continuation, xsrf_token) comment_response = get_continuation(continuation, comment_xsrf)
if not comment_response: if not comment_response:
continue continue
if list(search_dict(comment_response, 'externalErrorMessage')): if list(search_dict(comment_response, 'externalErrorMessage')):
@ -2094,7 +2095,7 @@ def get_continuation(continuation, session_token, replies=False):
continue continue
# not sure if this actually helps # not sure if this actually helps
if 'xsrf_token' in comment_response: if 'xsrf_token' in comment_response:
xsrf_token = comment_response['xsrf_token'] comment_xsrf = comment_response['xsrf_token']
item_section = comment_response['response']['continuationContents']['itemSectionContinuation'] item_section = comment_response['response']['continuationContents']['itemSectionContinuation']
if first_continuation: if first_continuation:
@ -2123,7 +2124,7 @@ def get_continuation(continuation, session_token, replies=False):
while reply_continuations: while reply_continuations:
time.sleep(1) time.sleep(1)
continuation = reply_continuations.pop() continuation = reply_continuations.pop()
replies_data = get_continuation(continuation, xsrf_token, True) replies_data = get_continuation(continuation, comment_xsrf, True)
if not replies_data or 'continuationContents' not in replies_data[1]['response']: if not replies_data or 'continuationContents' not in replies_data[1]['response']:
continue continue
@ -2152,10 +2153,13 @@ def get_continuation(continuation, session_token, replies=False):
time.sleep(1) time.sleep(1)
self.to_screen('Total comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count)) self.to_screen('Total comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count))
info.update({ return {
'comments': video_comments, 'comments': video_comments,
'comment_count': expected_video_comment_count 'comment_count': expected_video_comment_count
}) }
if get_comments:
info['__post_extractor'] = extract_comments
self.mark_watched(video_id, player_response) self.mark_watched(video_id, player_response)

View File

@ -347,7 +347,7 @@ def _dict_from_multiple_values_options_callback(
'Specify any key (see "OUTPUT TEMPLATE" for a list of available keys) to ' 'Specify any key (see "OUTPUT TEMPLATE" for a list of available keys) to '
'match if the key is present, ' 'match if the key is present, '
'!key to check if the key is not present, ' '!key to check if the key is not present, '
'key>NUMBER (like "comment_count > 12", also works with ' 'key>NUMBER (like "view_count > 12", also works with '
'>=, <, <=, !=, =) to compare against a number, ' '>=, <, <=, !=, =) to compare against a number, '
'key = \'LITERAL\' (like "uploader = \'Mike Smith\'", also works with !=) ' 'key = \'LITERAL\' (like "uploader = \'Mike Smith\'", also works with !=) '
'to match against a string literal ' 'to match against a string literal '
@ -985,7 +985,9 @@ def _dict_from_multiple_values_options_callback(
filesystem.add_option( filesystem.add_option(
'--get-comments', '--get-comments',
action='store_true', dest='getcomments', default=False, action='store_true', dest='getcomments', default=False,
help='Retrieve video comments to be placed in the .info.json file') help=(
'Retrieve video comments to be placed in the .info.json file. '
'The comments are fetched even without this option if the extraction is known to be quick'))
filesystem.add_option( filesystem.add_option(
'--load-info-json', '--load-info', '--load-info-json', '--load-info',
dest='load_info_filename', metavar='FILE', dest='load_info_filename', metavar='FILE',