[LinkedInLearning] Add subtitles (#1077)

Authored by: Ashish0804
Closes #1072
This commit is contained in:
Ashish Gupta 2021-09-25 16:55:33 +05:30 committed by GitHub
parent e99b2d2771
commit 8dc831f715
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
from itertools import zip_longest
import re
from .common import InfoExtractor
@ -8,6 +9,8 @@
ExtractorError,
float_or_none,
int_or_none,
srt_subtitles_timecode,
try_get,
urlencode_postdata,
urljoin,
)
@ -86,6 +89,16 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
},
}
def json2srt(self, transcript_lines, duration=None):
    """Convert transcript lines into SRT (SubRip) subtitle text.

    Each entry of ``transcript_lines`` is a dict that must provide
    ``transcriptStartAt`` (divided by 1000 before being formatted as a
    timecode -- presumably milliseconds; confirm against the API) and
    ``caption`` (the cue text).

    A cue ends where the next cue starts. The last cue ends at
    ``duration`` (in the same unit as the converted start times) when that
    is known and lies after the cue's start, otherwise one unit after its
    start.

    Returns the SRT document as a single string; an empty input yields ''.
    """
    cues = []
    # Pair every line with its successor; the last line is paired with None.
    paired_lines = zip_longest(transcript_lines, transcript_lines[1:])
    for index, (line_dict, next_dict) in enumerate(paired_lines, 1):
        start_time = line_dict['transcriptStartAt'] / 1000
        if next_dict:
            end_time = next_dict['transcriptStartAt'] / 1000
        elif duration is not None and duration > start_time:
            end_time = duration
        else:
            # Unknown or unusable duration: avoid a zero/negative-length
            # final cue by giving it a nominal length instead.
            end_time = start_time + 1
        # The trailing blank line is required by the SRT format to separate
        # one cue from the next; without it parsers merge adjacent cues.
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(start_time),
            srt_subtitles_timecode(end_time),
            line_dict['caption']))
    return ''.join(cues)
def _real_extract(self, url):
course_slug, video_slug = self._match_valid_url(url).groups()
@ -101,6 +114,7 @@ def _real_extract(self, url):
formats.append({
'format_id': 'progressive-%dp' % height,
'url': progressive_url,
'ext': 'mp4',
'height': height,
'width': width,
'source_preference': 1,
@ -128,6 +142,14 @@ def _real_extract(self, url):
# However, unless someone can confirm this, the old
# behaviour is being kept as-is
self._sort_formats(formats, ('res', 'source_preference'))
subtitles = {}
duration = int_or_none(video_data.get('durationInSeconds'))
transcript_lines = try_get(video_data, lambda x: x['transcript']['lines'], expected_type=list)
if transcript_lines:
subtitles['en'] = [{
'ext': 'srt',
'data': self.json2srt(transcript_lines, duration)
}]
return {
'id': self._get_video_id(video_data, course_slug, video_slug),
@ -135,7 +157,8 @@ def _real_extract(self, url):
'formats': formats,
'thumbnail': video_data.get('defaultThumbnail'),
'timestamp': float_or_none(video_data.get('publishedOn'), 1000),
'duration': int_or_none(video_data.get('durationInSeconds')),
'duration': duration,
'subtitles': subtitles,
}