diff --git a/pigz.c b/pigz.c index bbbfd2c..2b99f55 100644 --- a/pigz.c +++ b/pigz.c @@ -1,6 +1,6 @@ /* pigz.c -- parallel implementation of gzip * Copyright (C) 2007-2017 Mark Adler - * Version 2.4 26 Dec 2017 Mark Adler + * Version 2.4.1x xx Dec 2017 Mark Adler */ /* @@ -188,7 +188,7 @@ Fix sign error in compression reduction percentage */ -#define VERSION "pigz 2.4\n" +#define VERSION "pigz 2.4.1x" /* To-do: - make source portable for Windows, VMS, etc. (see gzip source code) @@ -352,8 +352,10 @@ #if __STDC_VERSION__-0 >= 199901L || __GNUC__-0 >= 3 # include // intmax_t, uintmax_t typedef uintmax_t length_t; + typedef uint32_t crc_t; #else typedef unsigned long length_t; + typedef unsigned long crc_t; #endif #ifdef PIGZ_DEBUG @@ -493,8 +495,10 @@ #define INBUFS(p) (((p)<<1)+3) #define OUTPOOL(s) ((s)+((s)>>4)+DICT) -// Input buffer size. -#define BUF 32768U +// Input buffer size, and augmentation for re-inserting a central header. +#define BUF 32768 +#define CEN 42 +#define EXT (BUF + CEN) // provide enough room to unget a header // Globals (modified by main thread only when it's the only thread). local struct { @@ -512,10 +516,12 @@ local struct { int force; // true to overwrite, compress links, cat int sync; // true to flush output file int form; // gzip = 0, zlib = 1, zip = 2 or 3 - unsigned char magic1; // first byte of possible header when decoding + int magic1; // first byte of possible header when decoding int recurse; // true to dive down into directory structure char *sufx; // suffix to use (".gz" or user supplied) - char *name; // name for gzip header + char *name; // name for gzip or zip header + char *alias; // name for zip header when input is stdin + char *comment; // comment for gzip or zip header. time_t mtime; // time stamp from input file for gzip header int list; // true to list files instead of compress int first; // true if we need to print listing header @@ -528,16 +534,19 @@ local struct { int procs; // maximum number of compression threads (>= 1) int setdict; // true to initialize dictionary in each thread size_t block; // uncompressed input size per thread (>= 32K) + crc_t shift; // pre-calculated CRC-32 shift for length block // saved gzip/zip header data for decompression, testing, and listing time_t stamp; // time stamp from gzip header char *hname; // name from header (allocated) + char *hcomm; // comment from header (allocated) unsigned long zip_crc; // local header crc length_t zip_clen; // local header compressed length length_t zip_ulen; // local header uncompressed length + int zip64; // true if has zip64 extended information // globals for decompression and listing buffered reading - unsigned char in_buf[BUF]; // input buffer + unsigned char in_buf[EXT]; // input buffer unsigned char *in_next; // next unused byte in buffer size_t in_left; // number of unused bytes in buffer int in_eof; // true if reached end of file on input @@ -548,7 +557,7 @@ local struct { #ifndef NOTHREAD // globals for decompression parallel reading - unsigned char in_buf2[BUF]; // second buffer for parallel reads + unsigned char in_buf2[EXT]; // second buffer for parallel reads size_t in_len; // data waiting in next buffer int in_which; // -1: start, 0: in_buf2, 1: in_buf lock *load_state; // value = 0 to wait, 1 to read a buffer @@ -576,13 +585,18 @@ local int complain(char *fmt, ...) { // Memory tracking. +#define MAXMEM 131072 // maximum number of tracked pointers + local struct mem_track_s { size_t num; // current number of allocations size_t size; // total size of current allocations + size_t tot; // maximum number of allocations size_t max; // maximum size of allocations #ifndef NOTHREAD lock *lock; // lock for access across threads #endif + size_t have; // number in array (possibly != num) + void *mem[MAXMEM]; // sorted array of allocated pointers } mem_track; #ifndef NOTHREAD @@ -593,50 +607,80 @@ local struct mem_track_s { # define mem_track_drop(m) #endif -local void *malloc_track(struct mem_track_s *mem, size_t size) { - void *ptr; +// Return the leftmost insert location of ptr in the sorted list mem->mem[], +// which currently has mem->have elements. If ptr is already in the list, the +// returned value will point to its first occurrence. The return location will +// be one after the last element if ptr is greater than all of the elements. +local size_t search_track(struct mem_track_s *mem, void *ptr) { + ptrdiff_t left = 0; + ptrdiff_t right = mem->have - 1; + while (left <= right) { + ptrdiff_t mid = (left + right) >> 1; + if (mem->mem[mid] < ptr) + left = mid + 1; + else + right = mid - 1; + } + return left; +} - ptr = malloc(size); - if (ptr != NULL) { - size = MALLOC_SIZE(ptr); - mem_track_grab(mem); - mem->num++; - mem->size += size; - if (mem->size > mem->max) - mem->max = mem->size; - mem_track_drop(mem); +// Insert ptr in the sorted list mem->mem[] and update the memory allocation +// statistics. +local void insert_track(struct mem_track_s *mem, void *ptr) { + mem_track_grab(mem); + assert(mem->have < MAXMEM && "increase MAXMEM in source and try again"); + size_t i = search_track(mem, ptr); + if (i < mem->have && mem->mem[i] == ptr) + complain("mem_track: duplicate pointer %p\n", ptr); + memmove(&mem->mem[i + 1], &mem->mem[i], + (mem->have - i) * sizeof(void *)); + mem->mem[i] = ptr; + mem->have++; + mem->num++; + mem->size += MALLOC_SIZE(ptr); + if (mem->num > mem->tot) + mem->tot = mem->num; + if (mem->size > mem->max) + mem->max = mem->size; + mem_track_drop(mem); +} + +// Find and delete ptr from the sorted list mem->mem[] and update the memory +// allocation statistics. +local void delete_track(struct mem_track_s *mem, void *ptr) { + mem_track_grab(mem); + size_t i = search_track(mem, ptr); + if (i < mem->num && mem->mem[i] == ptr) { + memmove(&mem->mem[i], &mem->mem[i + 1], + (mem->have - (i + 1)) * sizeof(void *)); + mem->have--; } + else + complain("mem_track: missing pointer %p\n", ptr); + mem->num--; + mem->size -= MALLOC_SIZE(ptr); + mem_track_drop(mem); +} + +local void *malloc_track(struct mem_track_s *mem, size_t size) { + void *ptr = malloc(size); + if (ptr != NULL) + insert_track(mem, ptr); return ptr; } local void *realloc_track(struct mem_track_s *mem, void *ptr, size_t size) { - size_t was; - if (ptr == NULL) return malloc_track(mem, size); - was = MALLOC_SIZE(ptr); - ptr = realloc(ptr, size); - if (ptr != NULL) { - size = MALLOC_SIZE(ptr); - mem_track_grab(mem); - mem->size -= was; - mem->size += size; - if (mem->size > mem->max) - mem->max = mem->size; - mem_track_drop(mem); - } - return ptr; + delete_track(mem, ptr); + void *got = realloc(ptr, size); + insert_track(mem, got == NULL ? ptr : got); + return got; } local void free_track(struct mem_track_s *mem, void *ptr) { - size_t size; - if (ptr != NULL) { - size = MALLOC_SIZE(ptr); - mem_track_grab(mem); - mem->num--; - mem->size -= size; - mem_track_drop(mem); + delete_track(mem, ptr); free(ptr); } } @@ -708,7 +752,9 @@ local void log_init(void) { if (log_tail == NULL) { mem_track.num = 0; mem_track.size = 0; + mem_track.num = 0; mem_track.max = 0; + mem_track.have = 0; #ifndef NOTHREAD mem_track.lock = new_lock(0); yarn_mem(yarn_malloc, yarn_free); @@ -818,8 +864,8 @@ local void log_dump(void) { complain("memory leak: %lu allocs of %lu bytes total", mem_track.num, mem_track.size); if (mem_track.max) - fprintf(stderr, "%lu bytes of memory used\n", - (unsigned long)mem_track.max); + fprintf(stderr, "%lu bytes of memory used in %lu allocs\n", + mem_track.max, mem_track.tot); } // Debugging macro. @@ -1017,7 +1063,7 @@ local unsigned put(int out, ...) { // write wrap[] to out and return the number of bytes written writen(out, wrap, count); - free(wrap); + FREE(wrap); return count; } @@ -1042,13 +1088,13 @@ local length_t put_header(void) { 4, (val_t)0, // crc (not here) 4, (val_t)LOW32, // compressed length (not here) 4, (val_t)LOW32, // uncompressed length (not here) - 2, (val_t)(g.name == NULL ? 1 : strlen(g.name)), // name length + 2, (val_t)(strlen(g.name == NULL ? g.alias : g.name)), // name len 2, (val_t)29, // length of extra field (see below) 0); - // write file name (use "-" for stdin) - len += writen(g.outd, g.name == NULL ? "-" : g.name, - g.name == NULL ? 1 : strlen(g.name)); + // write file name (use g.alias for stdin) + len += writen(g.outd, g.name == NULL ? g.alias : g.name, + strlen(g.name == NULL ? g.alias : g.name)); // write Zip64 and extended timestamp extra field blocks (29 bytes) len += put(g.outd, @@ -1063,6 +1109,8 @@ local length_t put_header(void) { 0); } else if (g.form) { // zlib + if (g.comment != NULL) + complain("can't store comment in zlib format -- ignoring"); unsigned head; head = (0x78 << 8) + // deflate, 32K window (g.level >= 9 ? 3 << 6 : @@ -1079,13 +1127,16 @@ local length_t put_header(void) { 1, (val_t)31, 1, (val_t)139, 1, (val_t)8, // deflate - 1, (val_t)(g.name != NULL ? 8 : 0), + 1, (val_t)((g.name != NULL ? 8 : 0) + + (g.comment != NULL ? 16 : 0)), 4, (val_t)g.mtime, 1, (val_t)(g.level >= 9 ? 2 : g.level == 1 ? 4 : 0), 1, (val_t)3, // unix 0); if (g.name != NULL) len += writen(g.outd, g.name, strlen(g.name) + 1); + if (g.comment != NULL) + len += writen(g.outd, g.comment, strlen(g.comment) + 1); } return len; } @@ -1119,18 +1170,18 @@ local void put_trailer(length_t ulen, length_t clen, 4, (val_t)check, // crc 4, (val_t)(zip64 ? LOW32 : clen), // compressed length 4, (val_t)(zip64 ? LOW32 : ulen), // uncompressed length - 2, (val_t)(g.name == NULL ? 1 : strlen(g.name)), // name length + 2, (val_t)(strlen(g.name == NULL ? g.alias : g.name)), // name len 2, (val_t)(zip64 ? 29 : 9), // extra field size (see below) - 2, (val_t)0, // no file comment + 2, (val_t)(g.comment == NULL ? 0 : strlen(g.comment)), // comment 2, (val_t)0, // disk number 0 2, (val_t)0, // internal file attributes 4, (val_t)0, // external file attributes (ignored) 4, (val_t)0, // offset of local header 0); - // write file name (use "-" for stdin) - cent += writen(g.outd, g.name == NULL ? "-" : g.name, - g.name == NULL ? 1 : strlen(g.name)); + // write file name (use g.alias for stdin) + cent += writen(g.outd, g.name == NULL ? g.alias : g.name, + strlen(g.name == NULL ? g.alias : g.name)); // write Zip64 extra field block (20 bytes) if (zip64) @@ -1149,6 +1200,10 @@ local void put_trailer(length_t ulen, length_t clen, 4, (val_t)g.mtime, // mod time 0); + // write comment, if requested + if (g.comment != NULL) + cent += writen(g.outd, g.comment, strlen(g.comment)); + // here zip64 is true if the offset of the central directory does not // fit in 32 bits, in which case insert the Zip64 end records to // provide a 64-bit offset @@ -1256,76 +1311,53 @@ local long zlib_vernum(void) { // We copy the combination routines from zlib here, in order to avoid linkage // issues with the zlib 1.2.3 builds on Sun, Ubuntu, and others. -local unsigned long gf2_matrix_times(unsigned long *mat, unsigned long vec) { - unsigned long sum; +// CRC-32 polynomial, reflected. +#define POLY 0xedb88320 - sum = 0; - while (vec) { - if (vec & 1) - sum ^= *mat; - vec >>= 1; - mat++; +// Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC +// polynomial, reflected. For speed, this requires that a not be zero. +local crc_t multmodp(crc_t a, crc_t b) { + crc_t m = (crc_t)1 << 31; + crc_t p = 0; + for (;;) { + if (a & m) { + p ^= b; + if ((a & (m - 1)) == 0) + break; + } + m >>= 1; + b = b & 1 ? (b >> 1) ^ POLY : b >> 1; } - return sum; + return p; } -local void gf2_matrix_square(unsigned long *square, unsigned long *mat) { - int n; - - for (n = 0; n < 32; n++) - square[n] = gf2_matrix_times(mat, mat[n]); +// Table of x^2^n modulo p(x). +local const crc_t x2n_table[] = { + 0x40000000, 0x20000000, 0x08000000, 0x00800000, 0x00008000, + 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, + 0xd7bbfe6a, 0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, + 0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3, 0x31fec169, + 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, + 0x2e4e5eef, 0x4eaba214, 0xa8a472c0, 0x429a969e, 0x148d302a, + 0xc40ba6d0, 0xc4e22c3c}; + +// Return x^(n*2^k) modulo p(x). +local crc_t x2nmodp(size_t n, unsigned k) { + crc_t p = (crc_t)1 << 31; // x^0 == 1 + while (n) { + if (n & 1) + p = multmodp(x2n_table[k & 31], p); + n >>= 1; + k++; + } + return p; } +// This uses the pre-computed g.shift value most of the time. Only the last +// combination requires a new x2nmodp() calculation. local unsigned long crc32_comb(unsigned long crc1, unsigned long crc2, size_t len2) { - int n; - unsigned long row; - unsigned long even[32]; // even-power-of-two zeros operator - unsigned long odd[32]; // odd-power-of-two zeros operator - - // degenerate case - if (len2 == 0) - return crc1; - - // put operator for one zero bit in odd - odd[0] = 0xedb88320UL; // CRC-32 polynomial - row = 1; - for (n = 1; n < 32; n++) { - odd[n] = row; - row <<= 1; - } - - // put operator for two zero bits in even - gf2_matrix_square(even, odd); - - // put operator for four zero bits in odd - gf2_matrix_square(odd, even); - - // apply len2 zeros to crc1 (first square will put the operator for one - // zero byte, eight zero bits, in even) - do { - // apply zeros operator for this bit of len2 - gf2_matrix_square(even, odd); - if (len2 & 1) - crc1 = gf2_matrix_times(even, crc1); - len2 >>= 1; - - // if no more bits set, then done - if (len2 == 0) - break; - - // another iteration of the loop with odd and even swapped - gf2_matrix_square(odd, even); - if (len2 & 1) - crc1 = gf2_matrix_times(odd, crc1); - len2 >>= 1; - - // if no more bits set, then done - } while (len2 != 0); - - // return combined crc - crc1 ^= crc2; - return crc1; + return multmodp(len2 == g.block ? g.shift : x2nmodp(len2, 3), crc1) ^ crc2; } #define BASE 65521U // largest prime smaller than 65536 @@ -1403,9 +1435,9 @@ local struct space *get_space(struct pool *pool) { // if a space is available, pull it from the list and return it if (pool->head != NULL) { space = pool->head; - possess(space->use); pool->head = space->next; twist(pool->have, BY, -1); // one less in pool + possess(space->use); twist(space->use, TO, 1); // initially one user space->len = 0; return space; @@ -1443,7 +1475,11 @@ local void grow_space(struct space *space) { // Increment the use count to require one more drop before returning this space // to the pool. local void use_space(struct space *space) { + long use; + possess(space->use); + use = peek_lock(space->use); + assert(use != 0); twist(space->use, BY, +1); } @@ -1457,6 +1493,7 @@ local void drop_space(struct space *space) { possess(space->use); use = peek_lock(space->use); assert(use != 0); + twist(space->use, BY, -1); if (use == 1) { pool = space->pool; possess(pool->have); @@ -1464,7 +1501,6 @@ local void drop_space(struct space *space) { pool->head = space; twist(pool->have, BY, +1); } - twist(space->use, BY, -1); } // Free the memory and lock resources of a pool. Return number of spaces for @@ -1926,6 +1962,7 @@ local void write_thread(void *dummy) { wait_for(job->calc, TO_BE, 1); release(job->calc); check = COMB(check, job->check, len); + Trace(("-- combined #%ld%s", seq, more ? "" : " (last)")); // free the job free_lock(job->calc); @@ -2362,10 +2399,9 @@ local void single_compress(int reset) { } } else -#else - { DEFLATE_WRITE(Z_SYNC_FLUSH); - } +#else + DEFLATE_WRITE(Z_SYNC_FLUSH); #endif if (!g.setdict) // two markers when independent DEFLATE_WRITE(Z_FULL_FLUSH); @@ -2545,7 +2581,7 @@ local size_t load(void) { } // Terminate the load() operation. Empty buffer, mark end, close file (if not -// stdin), and free the name obtained from the header, if any. +// stdin), and free the name and comment obtained from the header, if present. local void load_end(void) { #ifndef NOTHREAD // if the read thread is running, then end it @@ -2568,6 +2604,7 @@ local void load_end(void) { if (g.ind != 0) close(g.ind); RELEASE(g.hname); + RELEASE(g.hcomm); } // Initialize for reading new input. @@ -2584,7 +2621,6 @@ local void in_init(void) { // Buffered reading macros for decompression and listing. #define GET() (g.in_left == 0 && (g.in_eof || load() == 0) ? 0 : \ (g.in_left--, *g.in_next++)) -#define UNGET(n) (g.in_left += (n), g.in_next -= (n)) #define GET2() (tmp2 = GET(), tmp2 + ((unsigned)(GET()) << 8)) #define GET4() (tmp4 = GET2(), tmp4 + ((unsigned long)(GET2()) << 16)) #define SKIP(dist) \ @@ -2618,6 +2654,24 @@ local void in_init(void) { g.in_next += togo; \ } while (0) +// Get a zero-terminated string into allocated memory, with crc update. +#define GETZC(str) \ + do { \ + unsigned char *end; \ + size_t copy, have, size = 0; \ + have = 0; \ + do { \ + if (g.in_left == 0 && load() == 0) \ + return -3; \ + end = memchr(g.in_next, 0, g.in_left); \ + copy = end == NULL ? g.in_left : (size_t)(end - g.in_next) + 1; \ + have = vmemcpy(&str, &size, have, g.in_next, copy); \ + g.in_left -= copy; \ + g.in_next += copy; \ + } while (end == NULL); \ + crc = crc32z(crc, (unsigned char *)str, have); \ + } while (0) + // Pull LSB order or MSB order integers from an unsigned char buffer. #define PULL2L(p) ((p)[0] + ((unsigned)((p)[1]) << 8)) #define PULL4L(p) (PULL2L(p) + ((unsigned long)(PULL2L((p) + 2)) << 16)) @@ -2663,6 +2717,7 @@ local int read_extra(unsigned len, int save) { len -= size; if (id == 0x0001) { // Zip64 Extended Information Extra Field + g.zip64 = 1; if (g.zip_ulen == LOW32 && size >= 8) { g.zip_ulen = GET4(); SKIP(4); @@ -2720,26 +2775,28 @@ local int get_header(int save) { if (save) { g.stamp = 0; RELEASE(g.hname); + RELEASE(g.hcomm); } // see if it's a gzip, zlib, or lzw file - g.form = -1; g.magic1 = GET(); - if (g.in_eof) + if (g.in_eof) { + g.magic1 = -1; return -1; + } magic = (unsigned)g.magic1 << 8; magic += GET(); - if (g.in_eof) { - UNGET(1); + if (g.in_eof) return -2; - } if (magic % 31 == 0 && (magic & 0x8f20) == 0x0800) { // it's zlib g.form = 1; return 8; } - if (magic == 0x1f9d) // it's lzw + if (magic == 0x1f9d) { // it's lzw + g.form = -1; return 257; + } if (magic == 0x504b) { // it's zip magic = GET2(); // the rest of the signature if (g.in_eof) @@ -2748,9 +2805,10 @@ local int get_header(int save) { return -5; // central header or archive extra if (magic != 0x0403) return -4; // not a local header + g.zip64 = 0; SKIP(2); flags = GET2(); - if (flags & 0xfff0) + if (flags & 0xf7f0) return -4; method = GET(); // return low byte of method or 256 if (GET() != 0 || flags & 1) @@ -2790,7 +2848,8 @@ local int get_header(int save) { return g.in_eof ? -3 : (int)method; } if (magic != 0x1f8b) { // not gzip - UNGET(2); + g.in_left++; // return the second byte + g.in_next--; return -2; } @@ -2815,29 +2874,22 @@ local int get_header(int save) { SKIPC(GET2C()); // read file name, if present, into allocated memory - if ((flags & 8) && save) { - unsigned char *end; - size_t copy, have, size = 0; - have = 0; - do { - if (g.in_left == 0 && load() == 0) - return -3; - end = memchr(g.in_next, 0, g.in_left); - copy = end == NULL ? g.in_left : (size_t)(end - g.in_next) + 1; - have = vmemcpy(&g.hname, &size, have, g.in_next, copy); - g.in_left -= copy; - g.in_next += copy; - } while (end == NULL); - crc = crc32z(crc, (unsigned char *)g.hname, have); + if (flags & 8) { + if (save) + GETZC(g.hname); + else + while (GETC() != 0) + ; } - else if (flags & 8) - while (GETC() != 0) - ; - // skip comment - if (flags & 16) - while (GETC() != 0) - ; + // read comment, if present, into allocated memory + if (flags & 16) { + if (save) + GETZC(g.hcomm); + else + while (GETC() != 0) + ; + } // check header crc if ((flags & 2) && GET2() != (crc & 0xffff)) @@ -2848,6 +2900,122 @@ local int get_header(int save) { return g.in_eof ? -3 : (int)method; } +// Process the remainder of a zip file after the first entry. Return true if +// the next signature is another local file header. If listing verbosely, then +// search the remainder of the zip file for the central file header +// corresponding to the first zip entry, and save the file comment, if any. +local int more_zip_entries(void) { + unsigned long sig; + int ret, n; + unsigned char *first; + unsigned tmp2; // for macro + unsigned long tmp4; // for macro + unsigned char const central[] = {0x50, 0x4b, 1, 2}; + + sig = GET4(); + ret = !g.in_eof && sig == 0x04034b50; // true if another entry follows + if (!g.list || g.verbosity < 2) + return ret; + + // if it was a central file header signature, then already four bytes + // into a central directory header -- otherwise search for the next one + n = sig == 0x02014b50 ? 4 : 0; // number of bytes into central header + for (;;) { + // assure that more input is available + if (g.in_left == 0 && load() == 0) // never found it! + return ret; + if (n == 0) { + // look for first byte in central signature + first = memchr(g.in_next, central[0], g.in_left); + if (first == NULL) { + // not found -- go get the next buffer and keep looking + g.in_left = 0; + } + else { + // found -- continue search at next byte + n++; + g.in_left -= first - g.in_next + 1; + g.in_next = first + 1; + } + } + else if (n < 4) { + // look for the remaining bytes in the central signature + if (g.in_next[0] == central[n]) { + n++; + g.in_next++; + g.in_left--; + } + else + n = 0; // mismatch -- restart search with this byte + } + else { + // Now in a suspected central file header, just past the signature. + // Read the rest of the fixed-length portion of the header. + unsigned char head[CEN]; + size_t need = CEN, part = 0, len, i; + + if (need > g.in_left) { // will only need to do this once + part = g.in_left; + memcpy(head + CEN - need, g.in_next, part); + need -= part; + g.in_left = 0; + if (load() == 0) // never found it! + return ret; + } + memcpy(head + CEN - need, g.in_next, need); + + // Determine to sufficient probability that this is the droid we're + // looking for, by checking the CRC and the local header offset. + if (PULL4L(head + 12) == g.out_check && PULL4L(head + 38) == 0) { + // Update the number of bytes consumed from the current buffer. + g.in_next += need; + g.in_left -= need; + + // Get the comment length. + len = PULL2L(head + 28); + if (len == 0) // no comment + return ret; + + // Skip the file name and extra field. + SKIP(PULL2L(head + 24) + (unsigned long)PULL2L(head + 26)); + + // Save the comment field. + need = len; + g.hcomm = alloc(NULL, len + 1); + while (need > g.in_left) { + memcpy(g.hcomm + len - need, g.in_next, g.in_left); + need -= g.in_left; + g.in_left = 0; + if (load() == 0) { // premature EOF + RELEASE(g.hcomm); + return ret; + } + } + memcpy(g.hcomm + len - need, g.in_next, need); + g.in_next += need; + g.in_left -= need; + for (i = 0; i < len; i++) + if (g.hcomm[i] == 0) + g.hcomm[i] = ' '; + g.hcomm[len] = 0; + return ret; + } + else { + // Nope, false alarm. Restart the search at the first byte + // after what we thought was the central file header signature. + if (part) { + // Move buffer data up and insert the part of the header + // data read from the previous buffer. + memmove(g.in_next + part, g.in_next, g.in_left); + memcpy(g.in_next, head, part); + g.in_left += part; + } + n = 0; + } + } + } +} + // --- list contents of compressed input (gzip, zlib, or lzw) --- // Find standard compressed file suffix, return length of suffix. @@ -2882,7 +3050,7 @@ local size_t compressed_suffix(char *nm) { #define NAMEMAX1 48 // name display limit at verbosity 1 #define NAMEMAX2 16 // name display limit at verbosity 2 -// Print gzip or lzw file information. +// Print gzip, lzw, zlib, or zip file information. local void show_info(int method, unsigned long check, length_t len, int cont) { size_t max; // maximum name length for current verbosity size_t n; // name length without suffix @@ -2961,6 +3129,8 @@ local void show_info(int method, unsigned long check, length_t len, int cont) { g.in_tot, len, red, tag); #endif } + if (g.verbosity > 1 && g.hcomm != NULL) + puts(g.hcomm); } // List content information about the gzip file at ind (only works if the gzip @@ -2994,6 +3164,7 @@ local void list_info(void) { // list zip file if (g.form > 1) { + more_zip_entries(); // get first entry comment, if any g.in_tot = g.zip_clen; show_info(method, g.zip_crc, g.zip_ulen, 0); return; @@ -3087,6 +3258,12 @@ local void list_info(void) { // --- copy input to output (when acting like cat) --- local void cat(void) { + // copy the first header byte read, if any + if (g.magic1 != -1) { + unsigned char buf[1] = {g.magic1}; + g.out_tot += writen(g.outd, buf, 1); + } + // copy the remainder of the input to the output while (g.in_left) { g.out_tot += writen(g.outd, g.in_next, g.in_left); @@ -3177,6 +3354,8 @@ local void outb_check(void *dummy) { // write and check threads and return for more decompression while that's going // on (or just write and check if no threads or if proc == 1). local int outb(void *desc, unsigned char *buf, unsigned len) { + (void)desc; + #ifndef NOTHREAD static thread *wr, *ch; @@ -3205,14 +3384,8 @@ local int outb(void *desc, unsigned char *buf, unsigned len) { // if requested with len == 0, clean up -- terminate and join write and // check threads, free lock if (len == 0 && outb_write_more != NULL) { - if (desc != NULL) { - destruct(ch); - destruct(wr); - } - else { - join(ch); - join(wr); - } + join(ch); + join(wr); free_lock(outb_check_more); free_lock(outb_write_more); outb_write_more = NULL; @@ -3225,8 +3398,6 @@ local int outb(void *desc, unsigned char *buf, unsigned len) { } #endif - (void)desc; - // if just one process or no threads, then do it without threads if (len) { if (g.decode == 1) @@ -3237,19 +3408,24 @@ local int outb(void *desc, unsigned char *buf, unsigned len) { return 0; } +// Zip file data descriptor signature. This signature may or may not precede +// the CRC and lengths, with either resulting in a valid zip file! There is +// some odd code below that tries to detect and accommodate both cases. +#define SIG 0x08074b50 + // Inflate for decompression or testing. Decompress from ind to outd unless // decode != 1, in which case just test ind, and then also list if list != 0; // look for and decode multiple, concatenated gzip and/or zlib streams; read // and check the gzip, zlib, or zip trailer. local void infchk(void) { - int ret, cont, was; + int ret, cont, more; unsigned long check, len; z_stream strm; unsigned tmp2; unsigned long tmp4; length_t clen; - cont = 0; + cont = more = 0; do { // header already read -- set up for decompression g.in_tot = g.in_left; // track compressed data length @@ -3286,36 +3462,56 @@ local void infchk(void) { // read and check trailer if (g.form > 1) { // zip local trailer (if any) if (g.form == 3) { // data descriptor follows - // read original version of data descriptor + // get data descriptor values, assuming no signature g.zip_crc = GET4(); g.zip_clen = GET4(); - g.zip_ulen = GET4(); - if (g.in_eof) - throw(EDOM, "%s: corrupted entry -- missing trailer", - g.inf); - - // if crc doesn't match, try info-zip variant with sig - if (g.zip_crc != g.out_check) { - if (g.zip_crc != 0x08074b50UL || g.zip_clen != g.out_check) - throw(EDOM, "%s: corrupted entry -- crc32 mismatch", - g.inf); - g.zip_crc = g.zip_clen; - g.zip_clen = g.zip_ulen; - g.zip_ulen = GET4(); - } - - // handle incredibly rare cases where crc equals signature - else if (g.zip_crc == 0x08074b50UL && - g.zip_clen == g.zip_crc && - ((clen & LOW32) != g.zip_crc || - g.zip_ulen == g.zip_crc)) { + g.zip_ulen = GET4(); // ZIP64 -> high clen, not ulen + + // deduce whether or not a signature precedes the values + if (g.zip_crc == SIG && // might be the signature + // if the expected CRC is not SIG, then it's a signature + (g.out_check != SIG || // assume signature + // now we're in a very rare case where CRC == SIG -- the + // first four bytes could be the signature or the CRC + (g.zip_clen == SIG && // if not, then no signature + // now we have the first two words are SIG and the + // expected CRC is SIG, so it could be a signature and + // the CRC, or it could be the CRC and a compressed + // length that is *also* SIG (!) -- so check the low 32 + // bits of the expected compressed length for SIG + ((clen & LOW32) != SIG || // assume signature and CRC + // now the expected CRC *and* the expected low 32 bits + // of the compressed length are SIG -- this is so + // incredibly unlikely, clearly someone is messing with + // us, but we continue ... if the next four bytes are + // not SIG, then there is not a signature -- check those + // bytes, currently in g.zip_ulen: + (g.zip_ulen == SIG && // if not, then no signature + // we have three SIGs in a row in the descriptor, and + // both the expected CRC and the expected clen are SIG + // -- the first one is a signature if we don't expect + // the third word to be SIG, which is either the low 32 + // bits of ulen, or if ZIP64, the high 32 bits of clen: + (g.zip64 ? clen >> 32 : g.out_tot) != SIG + // if that last compare was equal, then the expected + // values for the CRC, the low 32 bits of clen, *and* + // the low 32 bits of ulen are all SIG (!!), or in the + // case of ZIP64, even crazier, the CRC and *both* + // 32-bit halves of clen are all SIG (clen > 500 + // petabytes!!!) ... we can no longer discriminate the + // hypotheses, so we will assume no signature + ))))) { + // first four bytes were actually the descriptor -- shift + // the values down and get another four bytes g.zip_crc = g.zip_clen; g.zip_clen = g.zip_ulen; g.zip_ulen = GET4(); } - // if second length doesn't match, try 64-bit lengths - if (g.zip_ulen != (g.out_tot & LOW32)) { + // if ZIP64, then ulen is really the high word of clen -- get + // the actual ulen and skip its high word as well (we only + // compare the low 32 bits of the lengths to verify) + if (g.zip64) { g.zip_ulen = GET4(); (void)GET4(); } @@ -3323,11 +3519,14 @@ local void infchk(void) { throw(EDOM, "%s: corrupted entry -- missing trailer", g.inf); } + check = g.zip_crc; + if (check != g.out_check) + throw(EDOM, "%s: corrupted entry -- crc32 mismatch", g.inf); if (g.zip_clen != (clen & LOW32) || g.zip_ulen != (g.out_tot & LOW32)) throw(EDOM, "%s: corrupted entry -- length mismatch", g.inf); - check = g.zip_crc; + more = more_zip_entries(); // see if more entries, get comment } else if (g.form == 1) { // zlib (big-endian) trailer check = (unsigned long)(GET()) << 24; @@ -3359,16 +3558,20 @@ local void infchk(void) { // if a gzip entry follows a gzip entry, decompress it (don't replace // saved header information from first entry) - was = g.form; - } while (was == 0 && (ret = get_header(0)) == 8 && g.form == 0); + } while (g.form == 0 && (ret = get_header(0)) == 8); // gzip -cdf copies junk after gzip stream directly to output - if (was == 0 && ret == -2 && g.force && g.pipeout && g.decode != 2 && + if (g.form == 0 && ret == -2 && g.force && g.pipeout && g.decode != 2 && !g.list) cat(); - else if (was > 1 && get_header(0) != -5) + + // check for more entries in zip file + else if (more) complain("warning: %s: entries after the first were ignored", g.inf); - else if ((was == 0 && ret != -1) || (was == 1 && (GET(), !g.in_eof))) + + // check for non-gzip after gzip stream, or anything after zlib stream + else if ((g.verbosity > 1 && g.form == 0 && ret != -1) || + (g.form == 1 && (GET(), !g.in_eof))) complain("warning: %s: trailing junk was ignored", g.inf); } @@ -3817,7 +4020,7 @@ local void process(char *path) { punt(err); complain("skipping: %s", err.why); drop(err); - outb(&g, NULL, 0); + outb(NULL, NULL, 0); } load_end(); return; @@ -3918,7 +4121,7 @@ local void process(char *path) { punt(err); complain("skipping: %s", err.why); drop(err); - outb(g.outf, NULL, 0); + outb(NULL, NULL, 0); if (g.outd != -1 && g.outd != 1) { close(g.outd); g.outd = -1; @@ -3974,8 +4177,10 @@ local char *helptext[] = { " -0 to -9, -11 Compression level (level 11, zopfli, is much slower)", #endif " --fast, --best Compression levels 1 and 9 respectively", +" -A, --alias xxx Use xxx as the name for any --zip entry from stdin", " -b, --blocksize mmm Set compression block size to mmmK (default 128K)", " -c, --stdout Write all processed output to stdout (won't delete)", +" -C, --comment ccc Put comment ccc in the gzip or zip header", " -d, --decompress Decompress the compressed input", " -f, --force Force overwrite, compress .gz, links, and to terminal", #ifndef NOZOPFLI @@ -4071,6 +4276,7 @@ local void defaults(void) { g.procs = nprocs(8); #endif g.block = 131072UL; // 128K + g.shift = x2nmodp(g.block, 3); g.rsync = 0; // don't do rsync blocking g.setdict = 1; // initialize dictionary each thread g.verbosity = 1; // normal message level @@ -4079,6 +4285,7 @@ local void defaults(void) { // where 01 is name and 10 is time g.pipeout = 0; // don't force output to stdout g.sufx = ".gz"; // compressed file suffix + g.comment = NULL; // no comment g.decode = 0; // compress g.list = 0; // compress g.keep = 0; // delete input file once compressed @@ -4090,8 +4297,9 @@ local void defaults(void) { // Long options conversion to short options. local char *longopts[][2] = { - {"LZW", "Z"}, {"lzw", "Z"}, {"ascii", "a"}, {"best", "9"}, {"bits", "Z"}, - {"blocksize", "b"}, {"decompress", "d"}, {"fast", "1"}, {"force", "f"}, + {"LZW", "Z"}, {"lzw", "Z"}, {"alias", "A"}, {"ascii", "a"}, {"best", "9"}, + {"bits", "Z"}, {"blocksize", "b"}, {"decompress", "d"}, {"fast", "1"}, + {"force", "f"}, {"comment", "C"}, #ifndef NOZOPFLI {"first", "F"}, {"iterations", "I"}, {"maxsplits", "J"}, {"oneblock", "O"}, #endif @@ -4136,7 +4344,7 @@ local int option(char *arg) { // if no argument or dash option, check status of get if (get && (arg == NULL || *arg == '-')) { - bad[1] = "bpSIM"[get - 1]; + bad[1] = "bpSIJAC"[get - 1]; throw(EINVAL, "missing parameter after %s", bad); } if (arg == NULL) @@ -4168,9 +4376,12 @@ local int option(char *arg) { // options until we have the parameter if (get) { if (get == 3) - throw(EINVAL, "invalid usage: " - "-s must be followed by space"); - break; // allow -pnnn and -bnnn, fall to parameter code + throw(EINVAL, + "invalid usage: -S must be followed by space"); + if (get == 7) + throw(EINVAL, + "invalid usage: -C must be followed by space"); + break; // allow -*nnn to fall to parameter code } // process next single character option or compression level @@ -4187,6 +4398,8 @@ local int option(char *arg) { if (g.level == 10 || g.level > 11) throw(EINVAL, "only levels 0..9 and 11 are allowed"); break; + case 'A': get = 6; break; + case 'C': get = 7; break; #ifndef NOZOPFLI case 'F': g.zopts.blocksplittinglast = 1; break; case 'I': get = 4; break; @@ -4194,11 +4407,10 @@ local int option(char *arg) { #endif case 'K': g.form = 2; g.sufx = ".zip"; break; case 'L': - fputs(VERSION, stderr); - fputs("Copyright (C) 2007-2017 Mark Adler\n", stderr); - fputs("Subject to the terms of the zlib license.\n", - stderr); - fputs("No warranty is provided or implied.\n", stderr); + puts(VERSION); + puts("Copyright (C) 2007-2017 Mark Adler"); + puts("Subject to the terms of the zlib license."); + puts("No warranty is provided or implied."); exit(0); case 'M': g.headis |= 0xa; break; case 'N': g.headis = 0xf; break; @@ -4209,9 +4421,9 @@ local int option(char *arg) { case 'S': get = 3; break; // -T defined below as an alternative for -m case 'V': - fputs(VERSION, stderr); + puts(VERSION); if (g.verbosity > 1) - fprintf(stderr, "zlib %s\n", zlibVersion()); + printf("zlib %s\n", zlibVersion()); exit(0); case 'Y': g.sync = 1; break; case 'Z': @@ -4245,13 +4457,14 @@ local int option(char *arg) { return 1; } - // process option parameter for -b, -p, -S, -I, or -J + // process option parameter for -b, -p, -A, -S, -I, or -J if (get) { size_t n; if (get == 1) { n = num(arg); g.block = n << 10; // chunk size + g.shift = x2nmodp(g.block, 3); if (g.block < DICT) throw(EINVAL, "block size too small (must be >= 32K)"); if (n != g.block >> 10 || @@ -4282,7 +4495,11 @@ local int option(char *arg) { g.zopts.numiterations = (int)num(arg); // optimize iterations else if (get == 5) g.zopts.blocksplittingmax = (int)num(arg); // max block splits + else if (get == 6) + g.alias = arg; // zip name for stdin #endif + else if (get == 7) + g.comment = arg; // header comment get = 0; return 1; } @@ -4294,7 +4511,7 @@ local int option(char *arg) { #ifndef NOTHREAD // handle error received from yarn function local void cut_yarn(int err) { - throw(err, err == ENOMEM ? "not enough memory" : "internal threads error"); + throw(err, "internal threads error"); } #endif @@ -4315,9 +4532,11 @@ int main(int argc, char **argv) { #ifndef NOTHREAD g.in_which = -1; #endif + g.alias = "-"; g.outf = NULL; g.first = 1; g.hname = NULL; + g.hcomm = NULL; // save pointer to program name for error messages p = strrchr(argv[0], '/'); @@ -4342,6 +4561,9 @@ int main(int argc, char **argv) { if (zlib_vernum() < 0x1230) throw(EINVAL, "zlib version less than 1.2.3"); + // create CRC table, in case zlib compiled with dynamic tables + get_crc_table(); + // process user environment variable defaults in GZIP opts = getenv("GZIP"); if (opts != NULL) {