From a473574d9b64a755c3172fd57aea27f0d89a8ad5 Mon Sep 17 00:00:00 2001
From: Andrej Peterka <andrej.peterka@castlabs.com>
Date: Thu, 19 Jun 2025 14:01:56 +0200
Subject: [PATCH 1/7] Improve merging of SCC subs

---
 pycaption/scc/__init__.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pycaption/scc/__init__.py b/pycaption/scc/__init__.py
index 0d3a3ff1..25c0b37c 100644
--- a/pycaption/scc/__init__.py
+++ b/pycaption/scc/__init__.py
@@ -286,7 +286,12 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0, merge_ca
                     nodes_to_append.append(CaptionNode(CaptionNode.BREAK))
                     captions_raw.remove(dupe_caption)
 
-                current_captions_with_same_time[0].nodes.extend(nodes_to_append)
+                if len(nodes_to_append) > 0:
+                    if nodes_to_append[-1].type_ == CaptionNode.BREAK:
+                        nodes_to_append.pop()
+
+                if nodes_to_append:
+                    current_captions_with_same_time[0].nodes.extend(nodes_to_append)
 
         captions = CaptionSet({lang: captions_raw})
 

From a7a8d1e452c870736259bf5acdfca8c63b06ae1d Mon Sep 17 00:00:00 2001
From: Andrej Peterka <andrej.peterka@castlabs.com>
Date: Thu, 19 Jun 2025 14:21:01 +0200
Subject: [PATCH 2/7] Add a function to remove layout info from captions

---
 pycaption/base.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/pycaption/base.py b/pycaption/base.py
index e55b39b4..8c3ca163 100644
--- a/pycaption/base.py
+++ b/pycaption/base.py
@@ -415,6 +415,15 @@ def remove_empty_captions(self):
                     out_captions.append(caption)
             self.set_captions(lang, out_captions)
 
+    def remove_layout_info(self):
+        for lang in self.get_languages():
+            captions = self.get_captions(lang)
+            for caption in captions:
+                caption.layout_info = None
+                for node in caption.nodes:
+                    node.layout_info = None
+
+
 
 # Functions
 def merge_concurrent_captions(caption_set):

From 28347c6e89cf235611b6cc5219272159f3386bf8 Mon Sep 17 00:00:00 2001
From: Andrej Peterka <andrej.peterka@castlabs.com>
Date: Thu, 19 Jun 2025 15:48:45 +0200
Subject: [PATCH 3/7] Move some helper functions to pycaption core

---
 pycaption/base.py | 59 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/pycaption/base.py b/pycaption/base.py
index 8c3ca163..b706af06 100644
--- a/pycaption/base.py
+++ b/pycaption/base.py
@@ -1,6 +1,7 @@
 import os
 import re
 from collections import defaultdict
+from collections import OrderedDict
 from datetime import timedelta
 from numbers import Number
 
@@ -423,6 +424,64 @@ def remove_layout_info(self):
                 for node in caption.nodes:
                     node.layout_info = None
 
+    @staticmethod
+    def _group_captions_by_start_time(caps: CaptionList):
+        """
+        Groups captions that have the same start time.
+
+        :param caps:
+        :return:
+        """
+
+        caps_start_time = OrderedDict()
+        for cap in caps:
+            if cap.start not in caps_start_time:
+                caps_start_time[cap.start] = [cap]
+            else:
+                caps_start_time[cap.start].append(cap)
+
+        # order by start timestamp
+        caps_start_time = OrderedDict(sorted(caps_start_time.items(), key=lambda item: item[0]))
+
+        # check if captions with the same start time also have the same end time
+        # fail if different end times are found - this is not (yet?) supported
+        caps_final = []
+        for caps_list in caps_start_time.values():
+            if len(caps_list) == 1:
+                caps_final.append(caps_list)
+            else:
+                end_times = {c.end for c in caps_list}  # distinct end times within the group
+                if len(end_times) != 1:
+                    raise ValueError("Unsupported subtitles - overlapping subtitles with different end times found")
+                else:
+                    caps_final.append(caps_list)
+        return caps_final
+
+    def make_sure_of_sane_start_times_and_gap(self, min_sub_gap_ms=250):
+        """
+        Makes sure that the start of a caption is not identical to end of the previous one
+        and that there is a minimum gap between captions.
+        :param min_sub_gap_ms:
+        :return:
+        """
+        for lang in self.get_languages():
+            _captions = self.get_captions(lang)
+            _captions_by_start = self._group_captions_by_start_time(_captions)
+
+            for i, caps in enumerate(_captions_by_start):
+                if i == 0:
+                    continue
+
+                prev_caption_end = _captions_by_start[i - 1][0].end
+                curr_caption_start = caps[0].start
+                curr_caption_end = caps[0].end
+
+                if curr_caption_start < prev_caption_end:
+                    for c in caps:
+                        c.start = min(prev_caption_end + (min_sub_gap_ms * 1000), c.end)
+                elif curr_caption_start == prev_caption_end:
+                    for c in caps:
+                        c.start = min(prev_caption_end + (min_sub_gap_ms * 1000), curr_caption_end)
 
 
 # Functions

From 5c078c184d6f1784d30f6b8c450e1650d1f5c2bb Mon Sep 17 00:00:00 2001
From: Andrej Peterka <andrej.peterka@castlabs.com>
Date: Thu, 19 Jun 2025 15:58:11 +0200
Subject: [PATCH 4/7] Move merging of captions from SCC reader to CaptionList

---
 pycaption/base.py         | 23 +++++++++++++++++++++++
 pycaption/scc/__init__.py | 25 +------------------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/pycaption/base.py b/pycaption/base.py
index b706af06..fe3e7093 100644
--- a/pycaption/base.py
+++ b/pycaption/base.py
@@ -483,6 +483,29 @@ def make_sure_of_sane_start_times_and_gap(self, min_sub_gap_ms=250):
                     for c in caps:
                         c.start = min(prev_caption_end + (min_sub_gap_ms * 1000), curr_caption_end)
 
+    def merge_captions(self):
+        """
+        Merge captions that have the same start and end time.
+        We do this by merging their nodes together, separating them with a line break.
+        """
+        for lang in self.get_languages():
+            captions_raw = self.get_captions(lang)
+            _captions_by_start = self._group_captions_by_start_time(captions_raw)
+
+            all_captions_with_same_time = [l for l in _captions_by_start if len(l) > 1]
+            for current_captions_with_same_time in all_captions_with_same_time:
+                nodes_to_append = [CaptionNode(CaptionNode.BREAK)]
+                for dupe_caption in current_captions_with_same_time[1:]:
+                    nodes_to_append.extend(dupe_caption.nodes)
+                    nodes_to_append.append(CaptionNode(CaptionNode.BREAK))
+                    captions_raw.remove(dupe_caption)
+
+                if len(nodes_to_append) > 0:
+                    if nodes_to_append[-1].type_ == CaptionNode.BREAK:
+                        nodes_to_append.pop()
+
+                if nodes_to_append:
+                    current_captions_with_same_time[0].nodes.extend(nodes_to_append)
 
 # Functions
 def merge_concurrent_captions(caption_set):
diff --git a/pycaption/scc/__init__.py b/pycaption/scc/__init__.py
index 25c0b37c..6bd81462 100644
--- a/pycaption/scc/__init__.py
+++ b/pycaption/scc/__init__.py
@@ -236,7 +236,7 @@ def detect(self, content):
         else:
             return False
 
-    def read(self, content, lang="en-US", simulate_roll_up=False, offset=0, merge_captions=False):
+    def read(self, content, lang="en-US", simulate_roll_up=False, offset=0):
         """Converts the unicode string into a CaptionSet
 
         :type content: str
@@ -253,11 +253,6 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0, merge_ca
         :type offset: int
         :param offset:
 
-        :type merge_captions: bool
-        :param merge_captions: If True, we will merge captions that have the same
-            start and end time. We do this by merging their nodes together, separating
-            them with a line break.
-
         :rtype: CaptionSet
         """
         if not isinstance(content, str):
@@ -275,24 +270,6 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0, merge_ca
         self._flush_implicit_buffers(self.buffer_dict.active_key)
 
         captions_raw = self.caption_stash.get_all()
-        if merge_captions:
-            _captions_by_start = self._group_captions_by_start_time(captions_raw)
-
-            all_captions_with_same_time = [l for l in _captions_by_start if len(l) > 1]
-            for current_captions_with_same_time in all_captions_with_same_time:
-                nodes_to_append = [CaptionNode(CaptionNode.BREAK)]
-                for dupe_caption in current_captions_with_same_time[1:]:
-                    nodes_to_append.extend(dupe_caption.nodes)
-                    nodes_to_append.append(CaptionNode(CaptionNode.BREAK))
-                    captions_raw.remove(dupe_caption)
-
-                if len(nodes_to_append) > 0:
-                    if nodes_to_append[-1].type_ == CaptionNode.BREAK:
-                        nodes_to_append.pop()
-
-                if nodes_to_append:
-                    current_captions_with_same_time[0].nodes.extend(nodes_to_append)
-
         captions = CaptionSet({lang: captions_raw})
 
         # check captions for incorrect lengths

From 35fe3d1f9227868a52c9bb627f8aa9272e2060ad Mon Sep 17 00:00:00 2001
From: Andrej Peterka <andrej.peterka@castlabs.com>
Date: Fri, 20 Jun 2025 08:51:24 +0200
Subject: [PATCH 5/7] Enable merging layout when merging subs

---
 pycaption/base.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/pycaption/base.py b/pycaption/base.py
index fe3e7093..e12dacb0 100644
--- a/pycaption/base.py
+++ b/pycaption/base.py
@@ -483,7 +483,7 @@ def make_sure_of_sane_start_times_and_gap(self, min_sub_gap_ms=250):
                     for c in caps:
                         c.start = min(prev_caption_end + (min_sub_gap_ms * 1000), curr_caption_end)
 
-    def merge_captions(self):
+    def merge_captions(self, merge_layout_info=False):
         """
         Merge captions that have the same start and end time.
         We do this by merging their nodes together, separating them with a line break.
@@ -505,7 +505,22 @@ def merge_captions(self):
                         nodes_to_append.pop()
 
                 if nodes_to_append:
-                    current_captions_with_same_time[0].nodes.extend(nodes_to_append)
+                    current_caption = current_captions_with_same_time[0]
+                    current_caption.nodes.extend(nodes_to_append)
+                    if merge_layout_info:
+                        layout_info = current_caption.layout_info  # caption-level layout wins
+                        if not layout_info:
+                            for node in current_caption.nodes:  # else first TEXT node with a layout
+                                if node.type_ == CaptionNode.TEXT:
+                                    layout_info = node.layout_info
+                                    if layout_info:
+                                        break
+                        if not layout_info:
+                            continue  # no layout in this group; still merge remaining groups/languages
+                        # stamp the resolved layout on the merged caption and every one of its nodes
+                        current_caption.layout_info = layout_info
+                        for node in current_caption.nodes:
+                            node.layout_info = layout_info
 
 # Functions
 def merge_concurrent_captions(caption_set):

From ebfec97025c300fd63f99ebc9c76db8ae942dd5d Mon Sep 17 00:00:00 2001
From: Andrej Peterka <andrej.peterka@castlabs.com>
Date: Fri, 20 Jun 2025 14:35:06 +0200
Subject: [PATCH 6/7] Add comments

---
 pycaption/base.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/pycaption/base.py b/pycaption/base.py
index e12dacb0..68e81c3b 100644
--- a/pycaption/base.py
+++ b/pycaption/base.py
@@ -383,6 +383,9 @@ def adjust_caption_timing(self, offset=0, rate_skew=1.0):
             self.set_captions(lang, out_captions)
 
     def strip_html_tags(self):
+        """
+        Iterates all captions and nodes in all languages and strips HTML tags (matching the RE_HTML_STRIP regex)
+        """
         for lang in self.get_languages():
             captions = self.get_captions(lang)
             out_captions = CaptionList()
@@ -394,6 +397,9 @@ def strip_html_tags(self):
             self.set_captions(lang, out_captions)
 
     def strip_ass_tags(self):
+        """
+        Iterates all captions and nodes in all languages and strips ASS tags (matching the RE_ASS_STRIP regex)
+        """
         for lang in self.get_languages():
             captions = self.get_captions(lang)
             out_captions = CaptionList()
@@ -405,6 +411,9 @@ def strip_ass_tags(self):
             self.set_captions(lang, out_captions)
 
     def remove_empty_captions(self):
+        """
+        Removes captions which have only empty TEXT nodes.
+        """
         for lang in self.get_languages():
             captions = self.get_captions(lang)
             out_captions = CaptionList()
@@ -417,6 +426,9 @@ def remove_empty_captions(self):
             self.set_captions(lang, out_captions)
 
     def remove_layout_info(self):
+        """
+        Removes layout info from all captions and nodes in all languages.
+        """
         for lang in self.get_languages():
             captions = self.get_captions(lang)
             for caption in captions:
@@ -425,12 +437,12 @@ def remove_layout_info(self):
                     node.layout_info = None
 
     @staticmethod
-    def _group_captions_by_start_time(caps: CaptionList):
+    def _group_captions_by_start_time(caps: CaptionList) -> list[list[Caption]]:
         """
         Groups captions that have the same start time.
 
-        :param caps:
-        :return:
+        :param caps: CaptionList of captions to group
+        :return: List of lists of captions, where each inner list contains captions with the same start time.
         """
 
         caps_start_time = OrderedDict()
@@ -461,8 +473,7 @@ def make_sure_of_sane_start_times_and_gap(self, min_sub_gap_ms=250):
         """
         Makes sure that the start of a caption is not identical to end of the previous one
         and that there is a minimum gap between captions.
-        :param min_sub_gap_ms:
-        :return:
+        :param min_sub_gap_ms: gap in milliseconds added after the previous caption's end when a caption starts at or before it
         """
         for lang in self.get_languages():
             _captions = self.get_captions(lang)

From 319b81b9f183fc3659976fdfe9be5f6efe6abed4 Mon Sep 17 00:00:00 2001
From: Andrej Peterka <andrej.peterka@castlabs.com>
Date: Fri, 20 Jun 2025 14:41:46 +0200
Subject: [PATCH 7/7] Add inline comments

---
 pycaption/base.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pycaption/base.py b/pycaption/base.py
index 68e81c3b..0b3c27f1 100644
--- a/pycaption/base.py
+++ b/pycaption/base.py
@@ -432,7 +432,10 @@ def remove_layout_info(self):
         for lang in self.get_languages():
             captions = self.get_captions(lang)
             for caption in captions:
+                # strip layout info from caption
                 caption.layout_info = None
+
+                # strip layout info from all nodes in caption
                 for node in caption.nodes:
                     node.layout_info = None
 
@@ -480,6 +483,7 @@ def make_sure_of_sane_start_times_and_gap(self, min_sub_gap_ms=250):
             _captions_by_start = self._group_captions_by_start_time(_captions)
 
             for i, caps in enumerate(_captions_by_start):
+                # skip the first caption, as it has no previous caption to compare to
                 if i == 0:
                     continue