1
1
mirror of https://github.com/ytdl-org/youtube-dl synced 2025-01-24 08:27:33 +01:00

Merge pull request #8348 from remitamine/dfxp2srt-text

[utils] fix dfxp2srt text extraction(fixes #8055)
This commit is contained in:
remitamine 2016-02-02 18:36:26 +01:00
commit 4e0cff2a50

View File

@ -2017,20 +2017,27 @@ def dfxp2srt(dfxp_data):
'ttaf1': 'http://www.w3.org/2006/10/ttaf1', 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
}) })
class TTMLPElementParser:
    """Parser target that collects the text content of an XML element.

    Intended to be passed as ``target`` to
    ``xml.etree.ElementTree.XMLParser``: character data is accumulated
    verbatim, line-break elements become newlines, and every other tag
    is dropped from the output.
    """

    # Text gathered so far. Strings are immutable, so the first update
    # rebinds ``out`` on the instance and leaves this class default intact.
    out = ''

    def start(self, tag, attrib):
        """Append a newline when the opening tag is a ``br`` element.

        A tag counts as a line break if it equals ``_x('ttml:br')``,
        ``_x('ttaf1:br')`` or the bare name ``'br'``; ``attrib`` is unused.
        """
        line_break_tags = (_x('ttml:br'), _x('ttaf1:br'), 'br')
        if tag not in line_break_tags:
            return
        self.out = self.out + '\n'

    def end(self, tag):
        """Ignore closing tags; they contribute nothing to the text."""

    def data(self, data):
        """Append a chunk of character data unchanged."""
        self.out = self.out + data

    def close(self):
        """Return the collected text with surrounding whitespace stripped."""
        return self.out.strip()
def parse_node(node): def parse_node(node):
str_or_empty = functools.partial(str_or_none, default='') target = TTMLPElementParser()
parser = xml.etree.ElementTree.XMLParser(target=target)
out = str_or_empty(node.text) parser.feed(xml.etree.ElementTree.tostring(node))
return parser.close()
for child in node:
if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
out += '\n' + str_or_empty(child.tail)
elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
out += str_or_empty(parse_node(child))
else:
out += str_or_empty(xml.etree.ElementTree.tostring(child))
return out
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = [] out = []