From 3e515824bf57615607cd14b9d89d9739a71e0c1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20=C5=BB=C3=B3=C5=82tak?= Date: Fri, 6 Dec 2024 18:23:51 +0100 Subject: [PATCH 1/6] Tar: add support for large and negative numbers In 2001 the GNU tar introduced support for large and negative numbers (https://www.gnu.org/software/tar/manual/html_node/Extensions.html#Extensions) This is required to handle files bigger than 8G. --- src/Tar.php | 67 ++++++++++++++++++++++++++++++++++++++++--- tests/TarTestCase.php | 26 +++++++++++++++++ 2 files changed, 89 insertions(+), 4 deletions(-) diff --git a/src/Tar.php b/src/Tar.php index 463880b..e2be011 100644 --- a/src/Tar.php +++ b/src/Tar.php @@ -553,8 +553,8 @@ protected function writeRawFileHeader($name, $uid, $gid, $perm, $size, $mtime, $ $uid = sprintf("%6s ", decoct($uid)); $gid = sprintf("%6s ", decoct($gid)); $perm = sprintf("%6s ", decoct($perm)); - $size = sprintf("%11s ", decoct($size)); - $mtime = sprintf("%11s", decoct($mtime)); + $size = self::numberEncode($size, 12); + $mtime = self::numberEncode($mtime, 12); $data_first = pack("a100a8a8a8a12A12", $name, $perm, $uid, $gid, $size, $mtime); $data_last = pack("a1a100a6a2a32a32a8a8a155a12", $typeflag, '', 'ustar', '', '', '', '', '', $prefix, ""); @@ -614,8 +614,8 @@ protected function parseHeader($block) $return['perm'] = OctDec(trim($header['perm'])); $return['uid'] = OctDec(trim($header['uid'])); $return['gid'] = OctDec(trim($header['gid'])); - $return['size'] = OctDec(trim($header['size'])); - $return['mtime'] = OctDec(trim($header['mtime'])); + $return['size'] = self::numberDecode($header['size']); + $return['mtime'] = self::numberDecode($header['mtime']); $return['typeflag'] = $header['typeflag']; $return['link'] = trim($header['link']); $return['uname'] = trim($header['uname']); @@ -713,4 +713,63 @@ public function filetype($file) return Archive::COMPRESS_NONE; } + /** + * Decodes numeric values according to the + * 
https://www.gnu.org/software/tar/manual/html_node/Extensions.html#Extensions + * (basically with support for big numbers) + * + * @param string $field + * @return int + */ + static public function numberDecode($field) + { + $firstByte = ord(substr($field, 0, 1)); + if ($firstByte === 255) { + $value = -1 << (8 * strlen($field)); + $shift = 0; + for ($i = strlen($field) - 1; $i >= 0; $i--) { + $value += ord(substr($field, $i, 1)) << $shift; + $shift += 8; + } + } elseif ($firstByte === 128) { + $value = 0; + $shift = 0; + for ($i = strlen($field) - 1; $i > 0; $i--) { + $value += ord(substr($field, $i, 1)) << $shift; + $shift += 8; + } + } else { + $value = octdec(trim($field)); + } + return $value; + } + + /** + * Encodes numeric values according to the + * https://www.gnu.org/software/tar/manual/html_node/Extensions.html#Extensions + * (basically with support for big numbers) + * + * @param int $value + * @param int $length field length + * @return string + */ + static public function numberEncode($value, $length) + { + // old implementations leave last byte empty + // octal encoding encodes three bits per byte + $maxValue = 1 << (($length - 1) * 3); + if ($value < 0) { + // PHP already stores integers as 2's complement + $value = pack(PHP_INT_SIZE === 8 ? 'J' : 'N', (int) $value); + $encoded = str_repeat(chr(255), max(1, $length - PHP_INT_SIZE)); + $encoded .= substr($value, max(0, PHP_INT_SIZE - $length + 1)); + } elseif ($value >= $maxValue) { + $value = pack(PHP_INT_SIZE === 8 ? 'J' : 'N', (int) $value); + $encoded = chr(128) . str_repeat(chr(0), max(0, $length - PHP_INT_SIZE - 1)); + $encoded .= substr($value, max(0, PHP_INT_SIZE - $length + 1)); + } else { + $encoded = sprintf("%" . ($length - 1) . 
"s ", decoct($value)); + } + return $encoded; + } } diff --git a/tests/TarTestCase.php b/tests/TarTestCase.php index 32cdeed..e4de42b 100644 --- a/tests/TarTestCase.php +++ b/tests/TarTestCase.php @@ -778,6 +778,32 @@ public function testSaveWithInvalidDestinationFile() $this->assertTrue(true); // succeed if no exception, yet } + public function testNumberEncodeDecode() + { + // 2^34 + 17 = 2^2 * 2^32 + 17 + $refValue = (1 << 34) + 17; + $encoded = Tar::numberEncode($refValue, 12); + $this->assertEquals(pack('CCnNN', 128, 0, 0, 1 << 2, 17), $encoded); + $decoded = Tar::numberDecode($encoded); + $this->assertEquals($refValue, $decoded); + + $encoded = Tar::numberEncode($refValue, 7); + $this->assertEquals(pack('CnN', 128, 1 << 2, 17), $encoded); + $decoded = Tar::numberDecode($encoded); + $this->assertEquals($refValue, $decoded); + + $refValue = -1234; + $encoded = Tar::numberEncode($refValue, 12); + $this->assertEquals(pack('CCnNN', 0xFF, 0xFF, 0xFFFF, 0xFFFFFFFF, -1234), $encoded); + $decoded = Tar::numberDecode($encoded); + $this->assertEquals($refValue, $decoded); + + $encoded = Tar::numberEncode($refValue, 3); + $this->assertEquals(pack('Cn', 0xFF, -1234), $encoded); + $decoded = Tar::numberDecode($encoded); + $this->assertEquals($refValue, $decoded); + } + /** * recursive rmdir()/unlink() * From 6136d0ab63bddce8cc4508b92a692cf0ed731c80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20=C5=BB=C3=B3=C5=82tak?= Date: Fri, 6 Dec 2024 20:15:23 +0100 Subject: [PATCH 2/6] Tar: allow reading archive content while iterating over archive entries. So far there was no way to read the data from a file in an archive without extracting it and extraction of a single file required rereading of a whole archive. This commit changes the yieldContents() in a way it does not skip to the next header entry before returning a current header content. A position of the next header entry is remembered instead and rewinded to only at the next next() call on the generator. 
This allows reading the current entry's content until the next() call; the Tar::readCurrentEntry() method was added for that purpose. --- src/Tar.php | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/Tar.php b/src/Tar.php index e2be011..52d5800 100644 --- a/src/Tar.php +++ b/src/Tar.php @@ -23,6 +23,8 @@ class Tar extends Archive protected $memory = ''; protected $closed = true; protected $writeaccess = false; + protected $position = 0; + protected $skipUntil = 0; /** * Sets the compression to use @@ -72,6 +74,7 @@ public function open($file) throw new ArchiveIOException('Could not open file for reading: '.$this->file); } $this->closed = false; + $this->position = 0; } /** @@ -118,12 +121,23 @@ public function yieldContents() continue; } - $this->skipbytes(ceil($header['size'] / 512) * 512); + $this->skipUntil = $this->position + ceil($header['size'] / 512) * 512; + yield $this->header2fileinfo($header); + + $skip = $this->skipUntil - $this->position; + if ($skip > 0) { + $this->skipbytes($skip); + } } $this->close(); + } + public function readCurrentEntry($length = PHP_INT_MAX) + { + $length = min($length, $this->skipUntil - $this->position); + return $this->readbytes($length); } /** @@ -439,12 +453,14 @@ public function save($file) protected function readbytes($length) { if ($this->comptype === Archive::COMPRESS_GZIP) { - return @gzread($this->fh, $length); + $ret = @gzread($this->fh, $length); } elseif ($this->comptype === Archive::COMPRESS_BZIP) { - return @bzread($this->fh, $length); + $ret = @bzread($this->fh, $length); } else { - return @fread($this->fh, $length); + $ret = @fread($this->fh, $length); } + $this->position += strlen($ret); + return $ret; } /** @@ -494,6 +510,7 @@ protected function skipbytes($bytes) } else { @fseek($this->fh, $bytes, SEEK_CUR); } + $this->position += $bytes; } /** From f931cad249679e372469f5ca9419d330d00dc672 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20=C5=BB=C3=B3=C5=82tak?= Date: 
Fri, 6 Dec 2024 21:06:24 +0100 Subject: [PATCH 3/6] Tar::readCurrentEntry(): recognize end of file properly --- src/Tar.php | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/Tar.php b/src/Tar.php index 52d5800..6fba270 100644 --- a/src/Tar.php +++ b/src/Tar.php @@ -24,6 +24,7 @@ class Tar extends Archive protected $closed = true; protected $writeaccess = false; protected $position = 0; + protected $contentUntil = 0; protected $skipUntil = 0; /** @@ -121,6 +122,7 @@ public function yieldContents() continue; } + $this->contentUntil = $this->position + $header['size']; $this->skipUntil = $this->position + ceil($header['size'] / 512) * 512; yield $this->header2fileinfo($header); @@ -134,9 +136,22 @@ public function yieldContents() $this->close(); } + /** + * Reads content of the current archive entry. + * + * Works only when iterating through the archive using the generator returned + * by the yieldContents(). + * + * @param int $length maximum number of bytes to read + * + * @return string + */ public function readCurrentEntry($length = PHP_INT_MAX) { - $length = min($length, $this->skipUntil - $this->position); + $length = (int) min($length, $this->contentUntil - $this->position); + if ($length === 0) { + return ''; + } return $this->readbytes($length); } @@ -790,3 +805,4 @@ static public function numberEncode($value, $length) return $encoded; } } + From f15ef3a95c7edceba392e3f5bbd479b5fb8035d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20=C5=BB=C3=B3=C5=82tak?= Date: Sat, 7 Dec 2024 09:51:45 +0100 Subject: [PATCH 4/6] Tar::addFile(): use larger read buffer for better performance --- src/Tar.php | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/Tar.php b/src/Tar.php index 6fba270..2ec5bab 100644 --- a/src/Tar.php +++ b/src/Tar.php @@ -319,16 +319,27 @@ public function addFile($file, $fileinfo = '') throw new ArchiveIOException('Could not open file for reading: ' . 
$file); } while (!feof($fp)) { - $data = fread($fp, 512); - $read += strlen($data); + // try to read 1 MB (512 bytes is unperformant) + $data = fread($fp, 1048576); if ($data === false) { break; } if ($data === '') { break; } - $packed = pack("a512", $data); - $this->writebytes($packed); + $dataLen = strlen($data); + $read += $dataLen; + // how much of data read fully fills 512-byte blocks? + $passLen = ($dataLen >> 9) << 9; + if ($passLen === $dataLen) { + // all - just write the data + $this->writebytes($data); + } else { + // directly write what fills 512-byte blocks fully + $this->writebytes(substr($data, 0, $passLen)); + // pad the remainder to 512 bytes + $this->writebytes(pack("a512", substr($data, $passLen))); + } } fclose($fp); From 5ff390cbd020fc854a8f6bba68191a24a6420364 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20=C5=BB=C3=B3=C5=82tak?= Date: Sat, 7 Dec 2024 10:01:10 +0100 Subject: [PATCH 5/6] Tar: write performance optimizations Tar::addData(): pad only the last block of data and write everything else with just a single writebytes() call and without pack(). Tar::addFile(): move the read chunk size to a class constant. --- src/Tar.php | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/Tar.php b/src/Tar.php index 2ec5bab..ce4b146 100644 --- a/src/Tar.php +++ b/src/Tar.php @@ -15,6 +15,7 @@ */ class Tar extends Archive { + const READ_CHUNK_SIZE = 1048576; // 1MB protected $file = ''; protected $comptype = Archive::COMPRESS_AUTO; @@ -319,8 +320,8 @@ public function addFile($file, $fileinfo = '') throw new ArchiveIOException('Could not open file for reading: ' . 
$file); } while (!feof($fp)) { - // try to read 1 MB (512 bytes is unperformant) - $data = fread($fp, 1048576); + // for performance reasons read bigger chunks at once + $data = fread($fp, self::READ_CHUNK_SIZE); if ($data === false) { break; } @@ -375,8 +376,11 @@ public function addData($fileinfo, $data) $fileinfo->setSize($len); $this->writeFileHeader($fileinfo); - for ($s = 0; $s < $len; $s += 512) { - $this->writebytes(pack("a512", substr($data, $s, 512))); + // write directly everything but the last block which needs padding + $passLen = ($len >> 9) << 9; + $this->writebytes(substr($data, 0, $passLen)); + if ($passLen < $len) { + $this->writebytes(pack("a512", substr($data, $passLen, 512))); } if (is_callable($this->callback)) { From 7b1936c294d3ba6867141e0ba27789d610bde7df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20=C5=BB=C3=B3=C5=82tak?= Date: Sat, 7 Dec 2024 10:23:55 +0100 Subject: [PATCH 6/6] TarTestCase::testReadCurrentEntry() added --- tests/TarTestCase.php | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/TarTestCase.php b/tests/TarTestCase.php index e4de42b..0df3a68 100644 --- a/tests/TarTestCase.php +++ b/tests/TarTestCase.php @@ -804,6 +804,31 @@ public function testNumberEncodeDecode() $this->assertEquals($refValue, $decoded); } + public function testReadCurrentEntry() + { + $tar = new Tar(); + $tar->open(__DIR__ . '/tar/test.tar'); + $out = sys_get_temp_dir() . '/dwtartest' . md5(time()); + $tar->extract($out); + + $tar = new Tar(); + $tar->open(__DIR__ . '/tar/test.tar'); + $pathsRead = array(); + foreach($tar->yieldContents() as $i) { + $this->assertFileExists($out . '/' . $i->getPath()); + if ($i->getIsdir()) { + $this->assertEquals('', $tar->readCurrentEntry()); + } else { + $this->assertStringEqualsFile($out . '/' . 
$i->getPath(), $tar->readCurrentEntry()); + } + $pathsRead[] = $i->getPath(); + } + $pathsReadRef = array('tar', 'tar/testdata1.txt', 'tar/foobar', 'tar/foobar/testdata2.txt'); + $this->assertEquals($pathsReadRef, $pathsRead); + + self::RDelete($out); + } + /** * recursive rmdir()/unlink() *