123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449 |
- /* gzjoin -- command to join gzip files into one gzip file
- Copyright (C) 2004 Mark Adler, all rights reserved
- version 1.0, 11 Dec 2004
- This software is provided 'as-is', without any express or implied
- warranty. In no event will the author be held liable for any damages
- arising from the use of this software.
- Permission is granted to anyone to use this software for any purpose,
- including commercial applications, and to alter it and redistribute it
- freely, subject to the following restrictions:
- 1. The origin of this software must not be misrepresented; you must not
- claim that you wrote the original software. If you use this software
- in a product, an acknowledgment in the product documentation would be
- appreciated but is not required.
- 2. Altered source versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
- 3. This notice may not be removed or altered from any source distribution.
- Mark Adler madler@alumni.caltech.edu
- */
- /*
- * Change history:
- *
- * 1.0 11 Dec 2004 - First version
- * 1.1 12 Jun 2005 - Changed ssize_t to long for portability
- */
- /*
- gzjoin takes one or more gzip files on the command line and writes out a
- single gzip file that will uncompress to the concatenation of the
- uncompressed data from the individual gzip files. gzjoin does this without
- having to recompress any of the data and without having to calculate a new
- crc32 for the concatenated uncompressed data. gzjoin does however have to
- decompress all of the input data in order to find the bits in the compressed
- data that need to be modified to concatenate the streams.
- gzjoin does not do an integrity check on the input gzip files other than
- checking the gzip header and decompressing the compressed data. They are
- otherwise assumed to be complete and correct.
- Each joint between gzip files removes at least 18 bytes of previous trailer
- and subsequent header, and inserts an average of about three bytes to the
- compressed data in order to connect the streams. The output gzip file
- has a minimal ten-byte gzip header with no file name or modification time.
- This program was written to illustrate the use of the Z_BLOCK option of
- inflate() and the crc32_combine() function. gzjoin will not compile with
- versions of zlib earlier than 1.2.3.
- */
- #include <stdio.h> /* fputs(), fprintf(), fwrite(), putc() */
- #include <stdlib.h> /* exit(), malloc(), free() */
- #include <fcntl.h> /* open() */
- #include <unistd.h> /* close(), read(), lseek() */
- #include "zlib.h"
- /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
- #define local static
- /* exit with an error (return a value to allow use in an expression) */
- local int bail(char *why1, char *why2)
- {
- fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2);
- exit(1);
- return 0;
- }
- /* -- simple buffered file input with access to the buffer -- */
- #define CHUNK 32768 /* must be a power of two and fit in unsigned */
- /* bin buffered input file type */
- typedef struct {
- char *name; /* name of file for error messages */
- int fd; /* file descriptor */
- unsigned left; /* bytes remaining at next */
- unsigned char *next; /* next byte to read */
- unsigned char *buf; /* allocated buffer of length CHUNK */
- } bin;
- /* close a buffered file and free allocated memory */
- local void bclose(bin *in)
- {
- if (in != NULL) {
- if (in->fd != -1)
- close(in->fd);
- if (in->buf != NULL)
- free(in->buf);
- free(in);
- }
- }
- /* open a buffered file for input, return a pointer to type bin, or NULL on
- failure */
- local bin *bopen(char *name)
- {
- bin *in;
- in = malloc(sizeof(bin));
- if (in == NULL)
- return NULL;
- in->buf = malloc(CHUNK);
- in->fd = open(name, O_RDONLY, 0);
- if (in->buf == NULL || in->fd == -1) {
- bclose(in);
- return NULL;
- }
- in->left = 0;
- in->next = in->buf;
- in->name = name;
- return in;
- }
- /* load buffer from file, return -1 on read error, 0 or 1 on success, with
- 1 indicating that end-of-file was reached */
- local int bload(bin *in)
- {
- long len;
- if (in == NULL)
- return -1;
- if (in->left != 0)
- return 0;
- in->next = in->buf;
- do {
- len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left);
- if (len < 0)
- return -1;
- in->left += (unsigned)len;
- } while (len != 0 && in->left < CHUNK);
- return len == 0 ? 1 : 0;
- }
- /* get a byte from the file, bail if end of file */
- #define bget(in) (in->left ? 0 : bload(in), \
- in->left ? (in->left--, *(in->next)++) : \
- bail("unexpected end of file on ", in->name))
- /* get a four-byte little-endian unsigned integer from file */
- local unsigned long bget4(bin *in)
- {
- unsigned long val;
- val = bget(in);
- val += (unsigned long)(bget(in)) << 8;
- val += (unsigned long)(bget(in)) << 16;
- val += (unsigned long)(bget(in)) << 24;
- return val;
- }
- /* skip bytes in file */
- local void bskip(bin *in, unsigned skip)
- {
- /* check pointer */
- if (in == NULL)
- return;
- /* easy case -- skip bytes in buffer */
- if (skip <= in->left) {
- in->left -= skip;
- in->next += skip;
- return;
- }
- /* skip what's in buffer, discard buffer contents */
- skip -= in->left;
- in->left = 0;
- /* seek past multiples of CHUNK bytes */
- if (skip > CHUNK) {
- unsigned left;
- left = skip & (CHUNK - 1);
- if (left == 0) {
- /* exact number of chunks: seek all the way minus one byte to check
- for end-of-file with a read */
- lseek(in->fd, skip - 1, SEEK_CUR);
- if (read(in->fd, in->buf, 1) != 1)
- bail("unexpected end of file on ", in->name);
- return;
- }
- /* skip the integral chunks, update skip with remainder */
- lseek(in->fd, skip - left, SEEK_CUR);
- skip = left;
- }
- /* read more input and skip remainder */
- bload(in);
- if (skip > in->left)
- bail("unexpected end of file on ", in->name);
- in->left -= skip;
- in->next += skip;
- }
- /* -- end of buffered input functions -- */
- /* skip the gzip header from file in */
- local void gzhead(bin *in)
- {
- int flags;
- /* verify gzip magic header and compression method */
- if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
- bail(in->name, " is not a valid gzip file");
- /* get and verify flags */
- flags = bget(in);
- if ((flags & 0xe0) != 0)
- bail("unknown reserved bits set in ", in->name);
- /* skip modification time, extra flags, and os */
- bskip(in, 6);
- /* skip extra field if present */
- if (flags & 4) {
- unsigned len;
- len = bget(in);
- len += (unsigned)(bget(in)) << 8;
- bskip(in, len);
- }
- /* skip file name if present */
- if (flags & 8)
- while (bget(in) != 0)
- ;
- /* skip comment if present */
- if (flags & 16)
- while (bget(in) != 0)
- ;
- /* skip header crc if present */
- if (flags & 2)
- bskip(in, 2);
- }
- /* write a four-byte little-endian unsigned integer to out */
- local void put4(unsigned long val, FILE *out)
- {
- putc(val & 0xff, out);
- putc((val >> 8) & 0xff, out);
- putc((val >> 16) & 0xff, out);
- putc((val >> 24) & 0xff, out);
- }
- /* Load up zlib stream from buffered input, bail if end of file */
- local void zpull(z_streamp strm, bin *in)
- {
- if (in->left == 0)
- bload(in);
- if (in->left == 0)
- bail("unexpected end of file on ", in->name);
- strm->avail_in = in->left;
- strm->next_in = in->next;
- }
- /* Write header for gzip file to out and initialize trailer. */
- local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
- {
- fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
- *crc = crc32(0L, Z_NULL, 0);
- *tot = 0;
- }
- /* Copy the compressed data from name, zeroing the last block bit of the last
- block if clr is true, and adding empty blocks as needed to get to a byte
- boundary. If clr is false, then the last block becomes the last block of
- the output, and the gzip trailer is written. crc and tot maintains the
- crc and length (modulo 2^32) of the output for the trailer. The resulting
- gzip file is written to out. gzinit() must be called before the first call
- of gzcopy() to write the gzip header and to initialize crc and tot. */
- local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
- FILE *out)
- {
- int ret; /* return value from zlib functions */
- int pos; /* where the "last block" bit is in byte */
- int last; /* true if processing the last block */
- bin *in; /* buffered input file */
- unsigned char *start; /* start of compressed data in buffer */
- unsigned char *junk; /* buffer for uncompressed data -- discarded */
- z_off_t len; /* length of uncompressed data (support > 4 GB) */
- z_stream strm; /* zlib inflate stream */
- /* open gzip file and skip header */
- in = bopen(name);
- if (in == NULL)
- bail("could not open ", name);
- gzhead(in);
- /* allocate buffer for uncompressed data and initialize raw inflate
- stream */
- junk = malloc(CHUNK);
- strm.zalloc = Z_NULL;
- strm.zfree = Z_NULL;
- strm.opaque = Z_NULL;
- strm.avail_in = 0;
- strm.next_in = Z_NULL;
- ret = inflateInit2(&strm, -15);
- if (junk == NULL || ret != Z_OK)
- bail("out of memory", "");
- /* inflate and copy compressed data, clear last-block bit if requested */
- len = 0;
- zpull(&strm, in);
- start = strm.next_in;
- last = start[0] & 1;
- if (last && clr)
- start[0] &= ~1;
- strm.avail_out = 0;
- for (;;) {
- /* if input used and output done, write used input and get more */
- if (strm.avail_in == 0 && strm.avail_out != 0) {
- fwrite(start, 1, strm.next_in - start, out);
- start = in->buf;
- in->left = 0;
- zpull(&strm, in);
- }
- /* decompress -- return early when end-of-block reached */
- strm.avail_out = CHUNK;
- strm.next_out = junk;
- ret = inflate(&strm, Z_BLOCK);
- switch (ret) {
- case Z_MEM_ERROR:
- bail("out of memory", "");
- case Z_DATA_ERROR:
- bail("invalid compressed data in ", in->name);
- }
- /* update length of uncompressed data */
- len += CHUNK - strm.avail_out;
- /* check for block boundary (only get this when block copied out) */
- if (strm.data_type & 128) {
- /* if that was the last block, then done */
- if (last)
- break;
- /* number of unused bits in last byte */
- pos = strm.data_type & 7;
- /* find the next last-block bit */
- if (pos != 0) {
- /* next last-block bit is in last used byte */
- pos = 0x100 >> pos;
- last = strm.next_in[-1] & pos;
- if (last && clr)
- strm.next_in[-1] &= ~pos;
- }
- else {
- /* next last-block bit is in next unused byte */
- if (strm.avail_in == 0) {
- /* don't have that byte yet -- get it */
- fwrite(start, 1, strm.next_in - start, out);
- start = in->buf;
- in->left = 0;
- zpull(&strm, in);
- }
- last = strm.next_in[0] & 1;
- if (last && clr)
- strm.next_in[0] &= ~1;
- }
- }
- }
- /* update buffer with unused input */
- in->left = strm.avail_in;
- in->next = strm.next_in;
- /* copy used input, write empty blocks to get to byte boundary */
- pos = strm.data_type & 7;
- fwrite(start, 1, in->next - start - 1, out);
- last = in->next[-1];
- if (pos == 0 || !clr)
- /* already at byte boundary, or last file: write last byte */
- putc(last, out);
- else {
- /* append empty blocks to last byte */
- last &= ((0x100 >> pos) - 1); /* assure unused bits are zero */
- if (pos & 1) {
- /* odd -- append an empty stored block */
- putc(last, out);
- if (pos == 1)
- putc(0, out); /* two more bits in block header */
- fwrite("\0\0\xff\xff", 1, 4, out);
- }
- else {
- /* even -- append 1, 2, or 3 empty fixed blocks */
- switch (pos) {
- case 6:
- putc(last | 8, out);
- last = 0;
- case 4:
- putc(last | 0x20, out);
- last = 0;
- case 2:
- putc(last | 0x80, out);
- putc(0, out);
- }
- }
- }
- /* update crc and tot */
- *crc = crc32_combine(*crc, bget4(in), len);
- *tot += (unsigned long)len;
- /* clean up */
- inflateEnd(&strm);
- free(junk);
- bclose(in);
- /* write trailer if this is the last gzip file */
- if (!clr) {
- put4(*crc, out);
- put4(*tot, out);
- }
- }
- /* join the gzip files on the command line, write result to stdout */
- int main(int argc, char **argv)
- {
- unsigned long crc, tot; /* running crc and total uncompressed length */
- /* skip command name */
- argc--;
- argv++;
- /* show usage if no arguments */
- if (argc == 0) {
- fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
- stderr);
- return 0;
- }
- /* join gzip files on command line and write to stdout */
- gzinit(&crc, &tot, stdout);
- while (argc--)
- gzcopy(*argv++, argc, &crc, &tot, stdout);
- /* done */
- return 0;
- }
|