From 638e587422ccc08c4ca4af204fc19cbe5688d0bf Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Thu, 25 Sep 2025 20:43:08 +0100 Subject: [PATCH 1/4] Commit --- Lib/test/test_codecencodings_jp.py | 41 +++++++++++++++++++ ...-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst | 2 + Modules/cjkcodecs/_codecs_jp.c | 13 +++++- 3 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst diff --git a/Lib/test/test_codecencodings_jp.py b/Lib/test/test_codecencodings_jp.py index 94378d124f7485..48d9f73db99398 100644 --- a/Lib/test/test_codecencodings_jp.py +++ b/Lib/test/test_codecencodings_jp.py @@ -106,6 +106,27 @@ class Test_SJIS_2004(multibytecodec_support.TestBase, unittest.TestCase): b"\x85Gℜ\x85Q = ⟨ሴ⟩" ) + def test_null_terminator(self): + # see gh-101828 + cases = ( + "バルーンフルーツ", + "ライフアップキノコ", + "テスト", + "'Tis but a scratch!" + ) + for case in cases: + with self.subTest(case=case): + encode_w_null = (case + "\0").encode(self.encoding) + encode_plus_null = case.encode(self.encoding) + "\0".encode(self.encoding) + self.assertTrue(encode_w_null.endswith(b'\x00')) + self.assertEqual(encode_w_null, encode_plus_null) + + encode_w_null_2 = encode_w_null + encode_w_null + encode_plus_null_2 = encode_plus_null + encode_plus_null + self.assertEqual(encode_w_null_2.count(b'\x00'), 2) + self.assertEqual(encode_w_null_2, encode_plus_null_2) + + class Test_SJISX0213(multibytecodec_support.TestBase, unittest.TestCase): encoding = 'shift_jisx0213' tstring = multibytecodec_support.load_teststring('shift_jisx0213') @@ -121,6 +142,26 @@ class Test_SJISX0213(multibytecodec_support.TestBase, unittest.TestCase): "\xab\u211c\xbb = \u2329\u1234\u232a", b"\x85Gℜ\x85Q = ⟨ሴ⟩" ) + def test_null_terminator(self): + # see gh-101828 + cases = ( + "バルーンフルーツ", + "ライフアップキノコ", + "テスト", + "'Tis but a scratch!" + ) + for case in cases: + with self.subTest(case=case): + encode_w_null = (case + "\0").encode(self.encoding) + encode_plus_null = case.encode(self.encoding) + "\0".encode(self.encoding) + self.assertTrue(encode_w_null.endswith(b'\x00')) + self.assertEqual(encode_w_null, encode_plus_null) + + encode_w_null_2 = encode_w_null + encode_w_null + encode_plus_null_2 = encode_plus_null + encode_plus_null + self.assertEqual(encode_w_null_2.count(b'\x00'), 2) + self.assertEqual(encode_w_null_2, encode_plus_null_2) + if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst b/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst new file mode 100644 index 00000000000000..942bc61698b4d3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst @@ -0,0 +1,2 @@ +Fix ``'shift_jisx0213'`` and ``'shift_jis_2004'`` codecs truncating null char +as it was treated as part of a multi-character sequence. diff --git a/Modules/cjkcodecs/_codecs_jp.c b/Modules/cjkcodecs/_codecs_jp.c index f7127487aa5f59..04b88a04305b51 100644 --- a/Modules/cjkcodecs/_codecs_jp.c +++ b/Modules/cjkcodecs/_codecs_jp.c @@ -611,8 +611,19 @@ ENCODER(shift_jis_2004) if (code == DBCINV) return 1; } - else + else if (ch2 != 0) { insize = 2; + } + else { + /* Don't consume null char as part of pair */ + code = find_pairencmap( + (ucs2_t)c, 0, + jisx0213_pair_encmap, + JISX0213_ENCPAIRS); + if (code == DBCINV) { + return 1; + } + } } } } From c565d52194bb3eef64a8f2b436e9fad0df85125a Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Thu, 25 Sep 2025 20:45:52 +0100 Subject: [PATCH 2/4] Commit --- Lib/test/test_codecencodings_jp.py | 3 +-- .../Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_codecencodings_jp.py b/Lib/test/test_codecencodings_jp.py index 48d9f73db99398..0cdfa0d13eb4a7 100644 --- a/Lib/test/test_codecencodings_jp.py +++ b/Lib/test/test_codecencodings_jp.py @@ -126,7 +126,6 @@ def test_null_terminator(self): self.assertEqual(encode_w_null_2.count(b'\x00'), 2) self.assertEqual(encode_w_null_2, encode_plus_null_2) - class Test_SJISX0213(multibytecodec_support.TestBase, unittest.TestCase): encoding = 'shift_jisx0213' tstring = multibytecodec_support.load_teststring('shift_jisx0213') @@ -142,6 +141,7 @@ class Test_SJISX0213(multibytecodec_support.TestBase, unittest.TestCase): "\xab\u211c\xbb = \u2329\u1234\u232a", b"\x85Gℜ\x85Q = ⟨ሴ⟩" ) + def test_null_terminator(self): # see gh-101828 cases = ( @@ -162,6 +162,5 @@ def test_null_terminator(self): self.assertEqual(encode_w_null_2.count(b'\x00'), 2) self.assertEqual(encode_w_null_2, encode_plus_null_2) - if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst b/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst index 942bc61698b4d3..9fe961f731493c 100644 --- a/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst +++ b/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst @@ -1,2 +1,2 @@ -Fix ``'shift_jisx0213'`` and ``'shift_jis_2004'`` codecs truncating null char -as it was treated as part of a multi-character sequence. +Fix ``'shift_jisx0213'`` and ``'shift_jis_2004'`` codecs truncating null chars +as they were treated as part of multi-character sequences. From adb384bf978df054882c9c0428472f1e93f83ec3 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 6 Oct 2025 17:12:34 +0100 Subject: [PATCH 3/4] Address Inada review + refactor test location --- Lib/test/multibytecodec_support.py | 16 ++++++++ Lib/test/test_codecencodings_jp.py | 40 ------------------- ...-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst | 3 +- Modules/cjkcodecs/_codecs_jp.c | 16 +++----- 4 files changed, 23 insertions(+), 52 deletions(-) diff --git a/Lib/test/multibytecodec_support.py b/Lib/test/multibytecodec_support.py index dbf0cc428e3ff6..205ac8bfc8698d 100644 --- a/Lib/test/multibytecodec_support.py +++ b/Lib/test/multibytecodec_support.py @@ -282,6 +282,22 @@ def test_incrementalencoder_del_segfault(self): with self.assertRaises(AttributeError): del e.errors + def test_null_terminator(self): + # see gh-101828 + if any(enc in self.encoding for enc in ('shift', 'euc_jis')): + text = "バルーンフルーツ" + else: + text = "Spam" + encode_w_null = (text + "\0").encode(self.encoding) + encode_plus_null = text.encode(self.encoding) + "\0".encode(self.encoding) + self.assertTrue(encode_w_null.endswith(b'\x00')) + self.assertEqual(encode_w_null, encode_plus_null) + + encode_w_null_2 = (text + "\0" + text + "\0").encode(self.encoding) + encode_plus_null_2 = encode_plus_null + encode_plus_null + self.assertEqual(encode_w_null_2.count(b'\x00'), 2) + self.assertEqual(encode_w_null_2, encode_plus_null_2) + class TestBase_Mapping(unittest.TestCase): pass_enctest = [] diff --git a/Lib/test/test_codecencodings_jp.py b/Lib/test/test_codecencodings_jp.py index 0cdfa0d13eb4a7..94378d124f7485 100644 --- a/Lib/test/test_codecencodings_jp.py +++ b/Lib/test/test_codecencodings_jp.py @@ -106,26 +106,6 @@ class Test_SJIS_2004(multibytecodec_support.TestBase, unittest.TestCase): b"\x85Gℜ\x85Q = ⟨ሴ⟩" ) - def test_null_terminator(self): - # see gh-101828 - cases = ( - "バルーンフルーツ", - "ライフアップキノコ", - "テスト", - "'Tis but a scratch!" - ) - for case in cases: - with self.subTest(case=case): - encode_w_null = (case + "\0").encode(self.encoding) - encode_plus_null = case.encode(self.encoding) + "\0".encode(self.encoding) - self.assertTrue(encode_w_null.endswith(b'\x00')) - self.assertEqual(encode_w_null, encode_plus_null) - - encode_w_null_2 = encode_w_null + encode_w_null - encode_plus_null_2 = encode_plus_null + encode_plus_null - self.assertEqual(encode_w_null_2.count(b'\x00'), 2) - self.assertEqual(encode_w_null_2, encode_plus_null_2) - class Test_SJISX0213(multibytecodec_support.TestBase, unittest.TestCase): encoding = 'shift_jisx0213' tstring = multibytecodec_support.load_teststring('shift_jisx0213') @@ -142,25 +122,5 @@ class Test_SJISX0213(multibytecodec_support.TestBase, unittest.TestCase): b"\x85Gℜ\x85Q = ⟨ሴ⟩" ) - def test_null_terminator(self): - # see gh-101828 - cases = ( - "バルーンフルーツ", - "ライフアップキノコ", - "テスト", - "'Tis but a scratch!" - ) - for case in cases: - with self.subTest(case=case): - encode_w_null = (case + "\0").encode(self.encoding) - encode_plus_null = case.encode(self.encoding) + "\0".encode(self.encoding) - self.assertTrue(encode_w_null.endswith(b'\x00')) - self.assertEqual(encode_w_null, encode_plus_null) - - encode_w_null_2 = encode_w_null + encode_w_null - encode_plus_null_2 = encode_plus_null + encode_plus_null - self.assertEqual(encode_w_null_2.count(b'\x00'), 2) - self.assertEqual(encode_w_null_2, encode_plus_null_2) - if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst b/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst index 9fe961f731493c..1d100180c072ec 100644 --- a/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst +++ b/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst @@ -1,2 +1,3 @@ -Fix ``'shift_jisx0213'`` and ``'shift_jis_2004'`` codecs truncating null chars +Fix ``'shift_jisx0213'``, ``'shift_jis_2004'``, ``'euc_jisx0213'`` and +``'euc_jis_2004'`` codecs truncating null chars as they were treated as part of multi-character sequences. diff --git a/Modules/cjkcodecs/_codecs_jp.c b/Modules/cjkcodecs/_codecs_jp.c index 04b88a04305b51..cd77888d5514b8 100644 --- a/Modules/cjkcodecs/_codecs_jp.c +++ b/Modules/cjkcodecs/_codecs_jp.c @@ -192,8 +192,11 @@ ENCODER(euc_jis_2004) JISX0213_ENCPAIRS); if (code == DBCINV) return 1; - } else + } + else if (c2 != 0) { + /* Don't consume null char as part of pair */ insize = 2; + } } } } @@ -612,17 +615,8 @@ ENCODER(shift_jis_2004) return 1; } else if (ch2 != 0) { - insize = 2; - } - else { /* Don't consume null char as part of pair */ - code = find_pairencmap( - (ucs2_t)c, 0, - jisx0213_pair_encmap, - JISX0213_ENCPAIRS); - if (code == DBCINV) { - return 1; - } + insize = 2; } } } From d9f910caaa6765383b3031f298a64406f49f9e5e Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Tue, 7 Oct 2025 17:37:18 +0100 Subject: [PATCH 4/4] Commit --- Lib/test/multibytecodec_support.py | 9 +++++---- Modules/cjkcodecs/_codecs_iso2022.c | 11 +++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/Lib/test/multibytecodec_support.py b/Lib/test/multibytecodec_support.py index 205ac8bfc8698d..6b4c57d0b4bad7 100644 --- a/Lib/test/multibytecodec_support.py +++ b/Lib/test/multibytecodec_support.py @@ -284,10 +284,11 @@ def test_incrementalencoder_del_segfault(self): def test_null_terminator(self): # see gh-101828 - if any(enc in self.encoding for enc in ('shift', 'euc_jis')): - text = "バルーンフルーツ" - else: - text = "Spam" + text = "フルーツ" + try: + text.encode(self.encoding) + except UnicodeEncodeError: + text = "Python is cool" encode_w_null = (text + "\0").encode(self.encoding) encode_plus_null = text.encode(self.encoding) + "\0".encode(self.encoding) self.assertTrue(encode_w_null.endswith(b'\x00')) diff --git a/Modules/cjkcodecs/_codecs_iso2022.c b/Modules/cjkcodecs/_codecs_iso2022.c index ef6faeb71274e1..b1984df2695b17 100644 --- a/Modules/cjkcodecs/_codecs_iso2022.c +++ b/Modules/cjkcodecs/_codecs_iso2022.c @@ -802,10 +802,13 @@ jisx0213_encoder(const MultibyteCodec *codec, const Py_UCS4 *data, return coded; case 2: /* second character of unicode pair */ - coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1], - jisx0213_pair_encmap, JISX0213_ENCPAIRS); - if (coded != DBCINV) - return coded; + if (data[1] != 0) { /* Don't consume null char as part of pair */ + coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1], + jisx0213_pair_encmap, JISX0213_ENCPAIRS); + if (coded != DBCINV) { + return coded; + } + } _Py_FALLTHROUGH; case -1: /* flush unterminated */