diff --git a/Lib/test/multibytecodec_support.py b/Lib/test/multibytecodec_support.py index dbf0cc428e3ff6..6b4c57d0b4bad7 100644 --- a/Lib/test/multibytecodec_support.py +++ b/Lib/test/multibytecodec_support.py @@ -282,6 +282,23 @@ def test_incrementalencoder_del_segfault(self): with self.assertRaises(AttributeError): del e.errors + def test_null_terminator(self): + # see gh-101828 + text = "フルーツ" + try: + text.encode(self.encoding) + except UnicodeEncodeError: + text = "Python is cool" + encode_w_null = (text + "\0").encode(self.encoding) + encode_plus_null = text.encode(self.encoding) + "\0".encode(self.encoding) + self.assertTrue(encode_w_null.endswith(b'\x00')) + self.assertEqual(encode_w_null, encode_plus_null) + + encode_w_null_2 = (text + "\0" + text + "\0").encode(self.encoding) + encode_plus_null_2 = encode_plus_null + encode_plus_null + self.assertEqual(encode_w_null_2.count(b'\x00'), 2) + self.assertEqual(encode_w_null_2, encode_plus_null_2) + class TestBase_Mapping(unittest.TestCase): pass_enctest = [] diff --git a/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst b/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst new file mode 100644 index 00000000000000..1d100180c072ec --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-09-25-20-16-10.gh-issue-101828.yTxJlJ.rst @@ -0,0 +1,3 @@ +Fix ``'shift_jisx0213'``, ``'shift_jis_2004'``, ``'euc_jisx0213'`` and +``'euc_jis_2004'`` codecs truncating null chars +as they were treated as part of multi-character sequences. diff --git a/Modules/cjkcodecs/_codecs_iso2022.c b/Modules/cjkcodecs/_codecs_iso2022.c index ef6faeb71274e1..b1984df2695b17 100644 --- a/Modules/cjkcodecs/_codecs_iso2022.c +++ b/Modules/cjkcodecs/_codecs_iso2022.c @@ -802,10 +802,13 @@ jisx0213_encoder(const MultibyteCodec *codec, const Py_UCS4 *data, return coded; case 2: /* second character of unicode pair */ - coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1], - jisx0213_pair_encmap, JISX0213_ENCPAIRS); - if (coded != DBCINV) - return coded; + if (data[1] != 0) { /* Don't consume null char as part of pair */ + coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1], + jisx0213_pair_encmap, JISX0213_ENCPAIRS); + if (coded != DBCINV) { + return coded; + } + } _Py_FALLTHROUGH; case -1: /* flush unterminated */ diff --git a/Modules/cjkcodecs/_codecs_jp.c b/Modules/cjkcodecs/_codecs_jp.c index f7127487aa5f59..cd77888d5514b8 100644 --- a/Modules/cjkcodecs/_codecs_jp.c +++ b/Modules/cjkcodecs/_codecs_jp.c @@ -192,8 +192,11 @@ ENCODER(euc_jis_2004) JISX0213_ENCPAIRS); if (code == DBCINV) return 1; - } else + } + else if (c2 != 0) { + /* Don't consume null char as part of pair */ insize = 2; + } } } } @@ -611,8 +614,10 @@ ENCODER(shift_jis_2004) if (code == DBCINV) return 1; } - else + else if (ch2 != 0) { + /* Don't consume null char as part of pair */ insize = 2; + } } } }