From a473574d9b64a755c3172fd57aea27f0d89a8ad5 Mon Sep 17 00:00:00 2001 From: Andrej Peterka Date: Thu, 19 Jun 2025 14:01:56 +0200 Subject: [PATCH 1/7] Improve merging of SCC subs --- pycaption/scc/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pycaption/scc/__init__.py b/pycaption/scc/__init__.py index 0d3a3ff1..25c0b37c 100644 --- a/pycaption/scc/__init__.py +++ b/pycaption/scc/__init__.py @@ -286,7 +286,12 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0, merge_ca nodes_to_append.append(CaptionNode(CaptionNode.BREAK)) captions_raw.remove(dupe_caption) - current_captions_with_same_time[0].nodes.extend(nodes_to_append) + if len(nodes_to_append) > 0: + if nodes_to_append[-1].type_ == CaptionNode.BREAK: + nodes_to_append.pop() + + if nodes_to_append: + current_captions_with_same_time[0].nodes.extend(nodes_to_append) captions = CaptionSet({lang: captions_raw}) From a7a8d1e452c870736259bf5acdfca8c63b06ae1d Mon Sep 17 00:00:00 2001 From: Andrej Peterka Date: Thu, 19 Jun 2025 14:21:01 +0200 Subject: [PATCH 2/7] Add a function to remove layout info from captions --- pycaption/base.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pycaption/base.py b/pycaption/base.py index e55b39b4..8c3ca163 100644 --- a/pycaption/base.py +++ b/pycaption/base.py @@ -415,6 +415,15 @@ def remove_empty_captions(self): out_captions.append(caption) self.set_captions(lang, out_captions) + def remove_layout_info(self): + for lang in self.get_languages(): + captions = self.get_captions(lang) + for caption in captions: + caption.layout_info = None + for node in caption.nodes: + node.layout_info = None + + # Functions def merge_concurrent_captions(caption_set): From 28347c6e89cf235611b6cc5219272159f3386bf8 Mon Sep 17 00:00:00 2001 From: Andrej Peterka Date: Thu, 19 Jun 2025 15:48:45 +0200 Subject: [PATCH 3/7] Move some helper functions to pycaptions core --- pycaption/base.py | 59 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/pycaption/base.py b/pycaption/base.py index 8c3ca163..b706af06 100644 --- a/pycaption/base.py +++ b/pycaption/base.py @@ -1,6 +1,7 @@ import os import re from collections import defaultdict +from collections import OrderedDict from datetime import timedelta from numbers import Number @@ -423,6 +424,64 @@ def remove_layout_info(self): for node in caption.nodes: node.layout_info = None + @staticmethod + def _group_captions_by_start_time(caps: CaptionList): + """ + Groups captions that have the same start time. + + :param caps: + :return: + """ + + caps_start_time = OrderedDict() + for i, cap in enumerate(caps): + if cap.start not in caps_start_time: + caps_start_time[cap.start] = [cap] + else: + caps_start_time[cap.start].append(cap) + + # order by start timestamp + caps_start_time = OrderedDict(sorted(caps_start_time.items(), key=lambda item: item[0])) + + # check if captions with the same start time also have the same end time + # fail if different end times are found - this is not (yet?) supported + caps_final = [] + for start_time, caps_list in caps_start_time.items(): + if len(caps_list) == 1: + caps_final.append(caps_list) + else: + end_times = list(set([c.end for c in caps_list])) + if len(end_times) != 1: + raise ValueError("Unsupported subtitles - overlapping subtitles with different end times found") + else: + caps_final.append(caps_list) + return caps_final + + def make_sure_of_sane_start_times_and_gap(self, min_sub_gap_ms=250): + """ + Makes sure that the start of a caption is not identical to end of the previous one + and that there is a minimum gap between captions. + :param min_sub_gap_ms: + :return: + """ + for lang in self.get_languages(): + _captions = self.get_captions(lang) + _captions_by_start = self._group_captions_by_start_time(_captions) + + for i, caps in enumerate(_captions_by_start): + if i == 0: + continue + + prev_caption_end = _captions_by_start[i - 1][0].end + curr_caption_start = caps[0].start + curr_caption_end = caps[0].end + + if curr_caption_start < prev_caption_end: + for c in caps: + c.start = min(prev_caption_end + (min_sub_gap_ms * 1000), c.end) + elif curr_caption_start == prev_caption_end: + for c in caps: + c.start = min(prev_caption_end + (min_sub_gap_ms * 1000), curr_caption_end) # Functions From 5c078c184d6f1784d30f6b8c450e1650d1f5c2bb Mon Sep 17 00:00:00 2001 From: Andrej Peterka Date: Thu, 19 Jun 2025 15:58:11 +0200 Subject: [PATCH 4/7] Move merging of captions from SCC reader to CaptionList --- pycaption/base.py | 23 +++++++++++++++++++++++ pycaption/scc/__init__.py | 25 +------------------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pycaption/base.py b/pycaption/base.py index b706af06..fe3e7093 100644 --- a/pycaption/base.py +++ b/pycaption/base.py @@ -483,6 +483,29 @@ def make_sure_of_sane_start_times_and_gap(self, min_sub_gap_ms=250): for c in caps: c.start = min(prev_caption_end + (min_sub_gap_ms * 1000), curr_caption_end) + def merge_captions(self): + """ + Merge captions that have the same start and end time. + We do this by merging their nodes together, separating them with a line break. + """ + for lang in self.get_languages(): + captions_raw = self.get_captions(lang) + _captions_by_start = self._group_captions_by_start_time(captions_raw) + + all_captions_with_same_time = [l for l in _captions_by_start if len(l) > 1] + for current_captions_with_same_time in all_captions_with_same_time: + nodes_to_append = [CaptionNode(CaptionNode.BREAK)] + for dupe_caption in current_captions_with_same_time[1:]: + nodes_to_append.extend(dupe_caption.nodes) + nodes_to_append.append(CaptionNode(CaptionNode.BREAK)) + captions_raw.remove(dupe_caption) + + if len(nodes_to_append) > 0: + if nodes_to_append[-1].type_ == CaptionNode.BREAK: + nodes_to_append.pop() + + if nodes_to_append: + current_captions_with_same_time[0].nodes.extend(nodes_to_append) # Functions def merge_concurrent_captions(caption_set): diff --git a/pycaption/scc/__init__.py b/pycaption/scc/__init__.py index 25c0b37c..6bd81462 100644 --- a/pycaption/scc/__init__.py +++ b/pycaption/scc/__init__.py @@ -236,7 +236,7 @@ def detect(self, content): else: return False - def read(self, content, lang="en-US", simulate_roll_up=False, offset=0, merge_captions=False): + def read(self, content, lang="en-US", simulate_roll_up=False, offset=0): """Converts the unicode string into a CaptionSet :type content: str @@ -253,11 +253,6 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0, merge_ca :type offset: int :param offset: - :type merge_captions: bool - :param merge_captions: If True, we will merge captions that have the same - start and end time. We do this by merging their nodes together, separating - them with a line break. - :rtype: CaptionSet """ if not isinstance(content, str): @@ -275,24 +270,6 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0, merge_ca self._flush_implicit_buffers(self.buffer_dict.active_key) captions_raw = self.caption_stash.get_all() - if merge_captions: - _captions_by_start = self._group_captions_by_start_time(captions_raw) - - all_captions_with_same_time = [l for l in _captions_by_start if len(l) > 1] - for current_captions_with_same_time in all_captions_with_same_time: - nodes_to_append = [CaptionNode(CaptionNode.BREAK)] - for dupe_caption in current_captions_with_same_time[1:]: - nodes_to_append.extend(dupe_caption.nodes) - nodes_to_append.append(CaptionNode(CaptionNode.BREAK)) - captions_raw.remove(dupe_caption) - - if len(nodes_to_append) > 0: - if nodes_to_append[-1].type_ == CaptionNode.BREAK: - nodes_to_append.pop() - - if nodes_to_append: - current_captions_with_same_time[0].nodes.extend(nodes_to_append) - captions = CaptionSet({lang: captions_raw}) # check captions for incorrect lengths From 35fe3d1f9227868a52c9bb627f8aa9272e2060ad Mon Sep 17 00:00:00 2001 From: Andrej Peterka Date: Fri, 20 Jun 2025 08:51:24 +0200 Subject: [PATCH 5/7] Enable merging layout when merging subs --- pycaption/base.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/pycaption/base.py b/pycaption/base.py index fe3e7093..e12dacb0 100644 --- a/pycaption/base.py +++ b/pycaption/base.py @@ -483,7 +483,7 @@ def make_sure_of_sane_start_times_and_gap(self, min_sub_gap_ms=250): for c in caps: c.start = min(prev_caption_end + (min_sub_gap_ms * 1000), curr_caption_end) - def merge_captions(self): + def merge_captions(self, merge_layout_info=False): """ Merge captions that have the same start and end time. We do this by merging their nodes together, separating them with a line break. @@ -505,7 +505,22 @@ def merge_captions(self): nodes_to_append.pop() if nodes_to_append: - current_captions_with_same_time[0].nodes.extend(nodes_to_append) + current_caption = current_captions_with_same_time[0] + current_caption.nodes.extend(nodes_to_append) + if merge_layout_info: + layout_info = current_caption.layout_info + if not layout_info: + for node in current_caption.nodes: + if node.type_ == CaptionNode.TEXT: + layout_info = node.layout_info + if layout_info: + break + if not layout_info: + return + + current_caption.layout_info = layout_info + for node in current_captions_with_same_time[0].nodes: + node.layout_info = layout_info # Functions def merge_concurrent_captions(caption_set): From ebfec97025c300fd63f99ebc9c76db8ae942dd5d Mon Sep 17 00:00:00 2001 From: Andrej Peterka Date: Fri, 20 Jun 2025 14:35:06 +0200 Subject: [PATCH 6/7] Add comments --- pycaption/base.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/pycaption/base.py b/pycaption/base.py index e12dacb0..68e81c3b 100644 --- a/pycaption/base.py +++ b/pycaption/base.py @@ -383,6 +383,9 @@ def adjust_caption_timing(self, offset=0, rate_skew=1.0): self.set_captions(lang, out_captions) def strip_html_tags(self): + """ + Iterates all captions and nodes in all languages and strips HTML tags (matching the RE_HTML_STRIP regex) + """ for lang in self.get_languages(): captions = self.get_captions(lang) out_captions = CaptionList() @@ -394,6 +397,9 @@ def strip_html_tags(self): self.set_captions(lang, out_captions) def strip_ass_tags(self): + """ + Iterates all captions and nodes in all languages and strips ASS tags (matching the RE_ASS_STRIP regex) + """ for lang in self.get_languages(): captions = self.get_captions(lang) out_captions = CaptionList() @@ -405,6 +411,9 @@ def strip_ass_tags(self): self.set_captions(lang, out_captions) def remove_empty_captions(self): + """ + Removes captions which have only empty TEXT nodes. + """ for lang in self.get_languages(): captions = self.get_captions(lang) out_captions = CaptionList() @@ -417,6 +426,9 @@ def remove_empty_captions(self): self.set_captions(lang, out_captions) def remove_layout_info(self): + """ + Removes layout info from all captions and nodes in all languages. + """ for lang in self.get_languages(): captions = self.get_captions(lang) for caption in captions: @@ -425,12 +437,12 @@ def remove_layout_info(self): node.layout_info = None @staticmethod - def _group_captions_by_start_time(caps: CaptionList): + def _group_captions_by_start_time(caps: CaptionList) -> list[list[Caption]]: """ Groups captions that have the same start time. - :param caps: - :return: + :param caps: CaptionList of captions to group + :return: List of lists of captions, where each inner list contains captions with the same start time. """ caps_start_time = OrderedDict() @@ -461,8 +473,7 @@ def make_sure_of_sane_start_times_and_gap(self, min_sub_gap_ms=250): """ Makes sure that the start of a caption is not identical to end of the previous one and that there is a minimum gap between captions. - :param min_sub_gap_ms: - :return: + :param min_sub_gap_ms: minimum gap in milliseconds that should be between captions """ for lang in self.get_languages(): _captions = self.get_captions(lang) From 319b81b9f183fc3659976fdfe9be5f6efe6abed4 Mon Sep 17 00:00:00 2001 From: Andrej Peterka Date: Fri, 20 Jun 2025 14:41:46 +0200 Subject: [PATCH 7/7] comments --- pycaption/base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pycaption/base.py b/pycaption/base.py index 68e81c3b..0b3c27f1 100644 --- a/pycaption/base.py +++ b/pycaption/base.py @@ -432,7 +432,10 @@ def remove_layout_info(self): for lang in self.get_languages(): captions = self.get_captions(lang) for caption in captions: + # strip layout info from caption caption.layout_info = None + + # strip layout info from all nodes in caption for node in caption.nodes: node.layout_info = None @@ -480,6 +483,7 @@ def make_sure_of_sane_start_times_and_gap(self, min_sub_gap_ms=250): _captions_by_start = self._group_captions_by_start_time(_captions) for i, caps in enumerate(_captions_by_start): + # skip the first caption, as it has no previous caption to compare to if i == 0: continue