castlabs · andrej-peterka · Jun 26, 2025 · Jun 19, 2025 · Jun 19, 2025 · Jun 19, 2025
diff --git a/pycaption/base.py b/pycaption/base.py
@@ -1,6 +1,7 @@
 import os
 import re
 from collections import defaultdict
+from collections import OrderedDict
 from datetime import timedelta
 from numbers import Number
 
@@ -382,6 +383,9 @@ def adjust_caption_timing(self, offset=0, rate_skew=1.0):
             self.set_captions(lang, out_captions)
 
     def strip_html_tags(self):
+        """
+        Iterates all captions and nodes in all languages and strips HTML tags (matching the RE_HTML_STRIP regex)
+        """
         for lang in self.get_languages():
             captions = self.get_captions(lang)
             out_captions = CaptionList()
@@ -393,6 +397,9 @@ def strip_html_tags(self):
             self.set_captions(lang, out_captions)
 
     def strip_ass_tags(self):
+        """
+        Iterates all captions and nodes in all languages and strips ASS tags (matching the RE_ASS_STRIP regex)
+        """
         for lang in self.get_languages():
             captions = self.get_captions(lang)
             out_captions = CaptionList()
@@ -404,6 +411,9 @@ def strip_ass_tags(self):
             self.set_captions(lang, out_captions)
 
     def remove_empty_captions(self):
+        """
+        Removes captions which have only empty TEXT nodes.
+        """
         for lang in self.get_languages():
             captions = self.get_captions(lang)
             out_captions = CaptionList()
@@ -415,6 +425,117 @@ def remove_empty_captions(self):
                     out_captions.append(caption)
             self.set_captions(lang, out_captions)
 
+    def remove_layout_info(self):
+        """
+        Removes layout info from all captions and nodes in all languages.
+        """
+        for lang in self.get_languages():
+            captions = self.get_captions(lang)
+            for caption in captions:
+                # strip layout info from caption
+                caption.layout_info = None
+
+                # strip layout info from all nodes in caption
+                for node in caption.nodes:
+                    node.layout_info = None
+
+    @staticmethod
+    def _group_captions_by_start_time(caps: CaptionList) -> list[list[Caption]]:
+        """
+        Groups captions that have the same start time.
+
+        :param caps: CaptionList of captions to group
+        :return: List of lists of captions, where each inner list contains captions with the same start time.
+        """
+
+        caps_start_time = OrderedDict()
+        for i, cap in enumerate(caps):
+            if cap.start not in caps_start_time:
+                caps_start_time[cap.start] = [cap]
+            else:
+                caps_start_time[cap.start].append(cap)
+
+        # order by start timestamp
+        caps_start_time = OrderedDict(sorted(caps_start_time.items(), key=lambda item: item[0]))
+
+        # check if captions with the same start time also have the same end time
+        # fail if different end times are found - this is not (yet?) supported
+        caps_final = []
+        for start_time, caps_list in caps_start_time.items():
+            if len(caps_list) == 1:
+                caps_final.append(caps_list)
+            else:
+                end_times = list(set([c.end for c in caps_list]))
+                if len(end_times) != 1:
+                    raise ValueError("Unsupported subtitles - overlapping subtitles with different end times found")
+                else:
+                    caps_final.append(caps_list)
+        return caps_final
+
+    def make_sure_of_sane_start_times_and_gap(self, min_sub_gap_ms=250):
+        """
+        Makes sure that the start of a caption is not identical to end of the previous one
+        and that there is a minimum gap between captions.
+        :param min_sub_gap_ms: minimum gap in milliseconds that should be between captions
+        """
+        for lang in self.get_languages():
+            _captions = self.get_captions(lang)
+            _captions_by_start = self._group_captions_by_start_time(_captions)
+
+            for i, caps in enumerate(_captions_by_start):
+                # skip the first caption, as it has no previous caption to compare to
+                if i == 0:
+                    continue
+
+                prev_caption_end = _captions_by_start[i - 1][0].end
+                curr_caption_start = caps[0].start
+                curr_caption_end = caps[0].end
+
+                if curr_caption_start < prev_caption_end:
+                    for c in caps:
+                        c.start = min(prev_caption_end + (min_sub_gap_ms * 1000), c.end)
+                elif curr_caption_start == prev_caption_end:
+                    for c in caps:
+                        c.start = min(prev_caption_end + (min_sub_gap_ms * 1000), curr_caption_end)
+
+    def merge_captions(self, merge_layout_info=False):
+        """
+        Merge captions that have the same start and end time.
+        We do this by merging their nodes together, separating them with a line break.
+        """
+        for lang in self.get_languages():
+            captions_raw = self.get_captions(lang)
+            _captions_by_start = self._group_captions_by_start_time(captions_raw)
+
+            all_captions_with_same_time = [l for l in _captions_by_start if len(l) > 1]
+            for current_captions_with_same_time in all_captions_with_same_time:
+                nodes_to_append = [CaptionNode(CaptionNode.BREAK)]
+                for dupe_caption in current_captions_with_same_time[1:]:
+                    nodes_to_append.extend(dupe_caption.nodes)
+                    nodes_to_append.append(CaptionNode(CaptionNode.BREAK))
+                    captions_raw.remove(dupe_caption)
+
+                if len(nodes_to_append) > 0:
+                    if nodes_to_append[-1].type_ == CaptionNode.BREAK:
+                        nodes_to_append.pop()
+
+                if nodes_to_append:
+                    current_caption = current_captions_with_same_time[0]
+                    current_caption.nodes.extend(nodes_to_append)
+                    if merge_layout_info:
+                        layout_info = current_caption.layout_info
+                        if not layout_info:
+                            for node in current_caption.nodes:
+                                if node.type_ == CaptionNode.TEXT:
+                                    layout_info = node.layout_info
+                                    if layout_info:
+                                        break
+                        if not layout_info:
+                            return
+
+                        current_caption.layout_info = layout_info
+                        for node in current_captions_with_same_time[0].nodes:
+                            node.layout_info = layout_info
 
 # Functions
 def merge_concurrent_captions(caption_set):

diff --git a/pycaption/scc/__init__.py b/pycaption/scc/__init__.py
@@ -236,7 +236,7 @@ def detect(self, content):
         else:
             return False
 
-    def read(self, content, lang="en-US", simulate_roll_up=False, offset=0, merge_captions=False):
+    def read(self, content, lang="en-US", simulate_roll_up=False, offset=0):
         """Converts the unicode string into a CaptionSet
 
         :type content: str
@@ -253,11 +253,6 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0, merge_ca
         :type offset: int
         :param offset:
 
-        :type merge_captions: bool
-        :param merge_captions: If True, we will merge captions that have the same
-            start and end time. We do this by merging their nodes together, separating
-            them with a line break.
-
         :rtype: CaptionSet
         """
         if not isinstance(content, str):
@@ -275,19 +270,6 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0, merge_ca
         self._flush_implicit_buffers(self.buffer_dict.active_key)
 
         captions_raw = self.caption_stash.get_all()
-        if merge_captions:
-            _captions_by_start = self._group_captions_by_start_time(captions_raw)
-
-            all_captions_with_same_time = [l for l in _captions_by_start if len(l) > 1]
-            for current_captions_with_same_time in all_captions_with_same_time:
-                nodes_to_append = [CaptionNode(CaptionNode.BREAK)]
-                for dupe_caption in current_captions_with_same_time[1:]:
-                    nodes_to_append.extend(dupe_caption.nodes)
-                    nodes_to_append.append(CaptionNode(CaptionNode.BREAK))
-                    captions_raw.remove(dupe_caption)
-
-                current_captions_with_same_time[0].nodes.extend(nodes_to_append)
-
         captions = CaptionSet({lang: captions_raw})
 
         # check captions for incorrect lengths