Biomedical Image Analysis Library
The Biomedical Image Analysis Library is a powerful tool for developers, physicians, researchers, engineers, and so on.
gzjoin.c
Go to the documentation of this file.
1 /* gzjoin -- command to join gzip files into one gzip file
2 
3  Copyright (C) 2004, 2005, 2012 Mark Adler, all rights reserved
4  version 1.2, 14 Aug 2012
5 
6  This software is provided 'as-is', without any express or implied
7  warranty. In no event will the author be held liable for any damages
8  arising from the use of this software.
9 
10  Permission is granted to anyone to use this software for any purpose,
11  including commercial applications, and to alter it and redistribute it
12  freely, subject to the following restrictions:
13 
14  1. The origin of this software must not be misrepresented; you must not
15  claim that you wrote the original software. If you use this software
16  in a product, an acknowledgment in the product documentation would be
17  appreciated but is not required.
18  2. Altered source versions must be plainly marked as such, and must not be
19  misrepresented as being the original software.
20  3. This notice may not be removed or altered from any source distribution.
21 
22  Mark Adler madler@alumni.caltech.edu
23  */
24 
25 /*
26  * Change history:
27  *
28  * 1.0 11 Dec 2004 - First version
29  * 1.1 12 Jun 2005 - Changed ssize_t to long for portability
30  * 1.2 14 Aug 2012 - Clean up for z_const usage
31  */
32 
33 /*
34  gzjoin takes one or more gzip files on the command line and writes out a
35  single gzip file that will uncompress to the concatenation of the
36  uncompressed data from the individual gzip files. gzjoin does this without
37  having to recompress any of the data and without having to calculate a new
38  crc32 for the concatenated uncompressed data. gzjoin does however have to
39  decompress all of the input data in order to find the bits in the compressed
40  data that need to be modified to concatenate the streams.
41 
42  gzjoin does not do an integrity check on the input gzip files other than
43  checking the gzip header and decompressing the compressed data. They are
44  otherwise assumed to be complete and correct.
45 
46  Each joint between gzip files removes at least 18 bytes of previous trailer
47  and subsequent header, and inserts an average of about three bytes to the
48  compressed data in order to connect the streams. The output gzip file
49  has a minimal ten-byte gzip header with no file name or modification time.
50 
51  This program was written to illustrate the use of the Z_BLOCK option of
52  inflate() and the crc32_combine() function. gzjoin will not compile with
53  versions of zlib earlier than 1.2.3.
54  */
55 
56 #include <stdio.h> /* fputs(), fprintf(), fwrite(), putc() */
57 #include <stdlib.h> /* exit(), malloc(), free() */
58 #include <fcntl.h> /* open() */
59 #include <unistd.h> /* close(), read(), lseek() */
60 #include "zlib.h"
61  /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
62 
63 #define local static
64 
65 /* exit with an error (return a value to allow use in an expression) */
66 local int bail(char *why1, char *why2)
67 {
68  fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2);
69  exit(1);
70  return 0;
71 }
72 
73 /* -- simple buffered file input with access to the buffer -- */
74 
75 #define CHUNK 32768 /* must be a power of two and fit in unsigned */
76 
77 /* bin: state of one buffered, read-only input file (allocated by bopen(), released by bclose()) */
78 typedef struct {
79  char *name; /* name of file for error messages; bopen() stores the caller's pointer, it is not copied */
80  int fd; /* file descriptor returned by open(), or -1 if the open failed */
81  unsigned left; /* number of unread bytes in the buffer, starting at next */
82  unsigned char *next; /* next byte to read; points into buf */
83  unsigned char *buf; /* allocated buffer of length CHUNK */
84 } bin;
85 
86 /* close a buffered file and free allocated memory */
88 {
89  if (in != NULL) {
90  if (in->fd != -1)
91  close(in->fd);
92  if (in->buf != NULL)
93  free(in->buf);
94  free(in);
95  }
96 }
97 
98 /* open a buffered file for input, return a pointer to type bin, or NULL on
99  failure */
100 local bin *bopen(char *name)
101 {
102  bin *in;
103 
104  in = malloc(sizeof(bin));
105  if (in == NULL)
106  return NULL;
107  in->buf = malloc(CHUNK);
108  in->fd = open(name, O_RDONLY, 0);
109  if (in->buf == NULL || in->fd == -1) {
110  bclose(in);
111  return NULL;
112  }
113  in->left = 0;
114  in->next = in->buf;
115  in->name = name;
116  return in;
117 }
118 
119 /* load buffer from file, return -1 on read error, 0 or 1 on success, with
120  1 indicating that end-of-file was reached */
122 {
123  long len;
124 
125  if (in == NULL)
126  return -1;
127  if (in->left != 0)
128  return 0;
129  in->next = in->buf;
130  do {
131  len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left);
132  if (len < 0)
133  return -1;
134  in->left += (unsigned)len;
135  } while (len != 0 && in->left < CHUNK);
136  return len == 0 ? 1 : 0;
137 }
138 
/* Get the next byte (0..255) from the buffered file in, refilling the buffer
   first when it is empty; bail if no byte is available afterwards.  The
   return value of bload() is not examined, so a read error is reported the
   same way as end of file.  The argument is evaluated several times and must
   therefore have no side effects. */
#define bget(in) ((in)->left ? 0 : bload(in), \
                  (in)->left ? ((in)->left--, *((in)->next)++) : \
                               bail("unexpected end of file on ", (in)->name))
143 
144 /* get a four-byte little-endian unsigned integer from file */
145 local unsigned long bget4(bin *in)
146 {
147  unsigned long val;
148 
149  val = bget(in);
150  val += (unsigned long)(bget(in)) << 8;
151  val += (unsigned long)(bget(in)) << 16;
152  val += (unsigned long)(bget(in)) << 24;
153  return val;
154 }
155 
156 /* skip bytes in file */
157 local void bskip(bin *in, unsigned skip)
158 {
159  /* check pointer */
160  if (in == NULL)
161  return;
162 
163  /* easy case -- skip bytes in buffer */
164  if (skip <= in->left) {
165  in->left -= skip;
166  in->next += skip;
167  return;
168  }
169 
170  /* skip what's in buffer, discard buffer contents */
171  skip -= in->left;
172  in->left = 0;
173 
174  /* seek past multiples of CHUNK bytes */
175  if (skip > CHUNK) {
176  unsigned left;
177 
178  left = skip & (CHUNK - 1);
179  if (left == 0) {
180  /* exact number of chunks: seek all the way minus one byte to check
181  for end-of-file with a read */
182  lseek(in->fd, skip - 1, SEEK_CUR);
183  if (read(in->fd, in->buf, 1) != 1)
184  bail("unexpected end of file on ", in->name);
185  return;
186  }
187 
188  /* skip the integral chunks, update skip with remainder */
189  lseek(in->fd, skip - left, SEEK_CUR);
190  skip = left;
191  }
192 
193  /* read more input and skip remainder */
194  bload(in);
195  if (skip > in->left)
196  bail("unexpected end of file on ", in->name);
197  in->left -= skip;
198  in->next += skip;
199 }
200 
201 /* -- end of buffered input functions -- */
202 
203 /* skip the gzip header from file in */
205 {
206  int flags;
207 
208  /* verify gzip magic header and compression method */
209  if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
210  bail(in->name, " is not a valid gzip file");
211 
212  /* get and verify flags */
213  flags = bget(in);
214  if ((flags & 0xe0) != 0)
215  bail("unknown reserved bits set in ", in->name);
216 
217  /* skip modification time, extra flags, and os */
218  bskip(in, 6);
219 
220  /* skip extra field if present */
221  if (flags & 4) {
222  unsigned len;
223 
224  len = bget(in);
225  len += (unsigned)(bget(in)) << 8;
226  bskip(in, len);
227  }
228 
229  /* skip file name if present */
230  if (flags & 8)
231  while (bget(in) != 0)
232  ;
233 
234  /* skip comment if present */
235  if (flags & 16)
236  while (bget(in) != 0)
237  ;
238 
239  /* skip header crc if present */
240  if (flags & 2)
241  bskip(in, 2);
242 }
243 
/* Write the low 32 bits of val to out as four bytes, least significant byte
   first.  Errors from putc() are not checked. */
local void put4(unsigned long val, FILE *out)
{
    int shift;

    for (shift = 0; shift < 32; shift += 8)
        putc((val >> shift) & 0xff, out);
}
252 
253 /* Load up zlib stream from buffered input, bail if end of file */
254 local void zpull(z_streamp strm, bin *in)
255 {
256  if (in->left == 0)
257  bload(in);
258  if (in->left == 0)
259  bail("unexpected end of file on ", in->name);
260  strm->avail_in = in->left;
261  strm->next_in = in->next;
262 }
263 
264 /* Write header for gzip file to out and initialize trailer. */
265 local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
266 {
267  fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
268  *crc = crc32(0L, Z_NULL, 0);
269  *tot = 0;
270 }
271 
272 /* Copy the compressed data from the gzip file name to out, zeroing the last block bit of the last
273  block if clr is true, and adding empty blocks as needed to get to a byte
274  boundary. If clr is false, then the last block becomes the last block of
275  the output, and the gzip trailer is written. crc and tot maintain the
276  crc and length (modulo 2^32) of the output for the trailer. The resulting
277  gzip file is written to out. gzinit() must be called before the first call
278  of gzcopy() to write the gzip header and to initialize crc and tot. Failures (file cannot be opened, out of memory, truncated or invalid input) end the program through bail(); results of writes to out are not checked. */
279 local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
280  FILE *out)
281 {
282  int ret; /* return value from zlib functions */
283  int pos; /* where the "last block" bit is in byte */
284  int last; /* true if processing the last block */
285  bin *in; /* buffered input file */
286  unsigned char *start; /* start of compressed data in buffer that has not been written to out yet */
287  unsigned char *junk; /* buffer for uncompressed data -- discarded */
288  z_off_t len; /* length of uncompressed data (support > 4 GB) */
289  z_stream strm; /* zlib inflate stream */
290 
291  /* open gzip file and skip header */
292  in = bopen(name);
293  if (in == NULL)
294  bail("could not open ", name);
295  gzhead(in);
296 
297  /* allocate buffer for uncompressed data and initialize raw inflate
298  stream; both results are tested together after the calls */
299  junk = malloc(CHUNK);
300  strm.zalloc = Z_NULL;
301  strm.zfree = Z_NULL;
302  strm.opaque = Z_NULL;
303  strm.avail_in = 0;
304  strm.next_in = Z_NULL;
305  ret = inflateInit2(&strm, -15);
306  if (junk == NULL || ret != Z_OK)
307  bail("out of memory", "");
308 
309  /* inflate and copy compressed data, clear last-block bit if requested */
310  len = 0;
311  zpull(&strm, in);
312  start = in->next;
313  last = start[0] & 1; /* low bit of the first compressed byte is the first block's last-block bit */
314  if (last && clr)
315  start[0] &= ~1;
316  strm.avail_out = 0; /* keeps the refill test below false on the first pass */
317  for (;;) {
318  /* if input used and output done, write used input and get more */
319  if (strm.avail_in == 0 && strm.avail_out != 0) {
320  fwrite(start, 1, strm.next_in - start, out);
321  start = in->buf;
322  in->left = 0; /* mark the buffer empty so that zpull() reloads it */
323  zpull(&strm, in);
324  }
325 
326  /* decompress -- return early when end-of-block reached */
327  strm.avail_out = CHUNK;
328  strm.next_out = junk;
329  ret = inflate(&strm, Z_BLOCK);
330  switch (ret) {
331  case Z_MEM_ERROR:
332  bail("out of memory", ""); /* bail() exits, so control never reaches the next case */
333  case Z_DATA_ERROR:
334  bail("invalid compressed data in ", in->name);
335  }
336 
337  /* update length of uncompressed data */
338  len += CHUNK - strm.avail_out;
339 
340  /* check for block boundary (only get this when block copied out) */
341  if (strm.data_type & 128) {
342  /* if that was the last block, then done */
343  if (last)
344  break;
345 
346  /* number of unused bits in last byte */
347  pos = strm.data_type & 7;
348 
349  /* find the next last-block bit */
350  if (pos != 0) {
351  /* next last-block bit is in last used byte */
352  pos = 0x100 >> pos; /* mask selecting the lowest of the unused bits */
353  last = strm.next_in[-1] & pos;
354  if (last && clr)
355  in->buf[strm.next_in - in->buf - 1] &= ~pos; /* same byte as strm.next_in[-1], addressed through in->buf; presumably because next_in may be const-qualified (z_const) -- confirm */
356  }
357  else {
358  /* next last-block bit is in next unused byte */
359  if (strm.avail_in == 0) {
360  /* don't have that byte yet -- get it */
361  fwrite(start, 1, strm.next_in - start, out);
362  start = in->buf;
363  in->left = 0;
364  zpull(&strm, in);
365  }
366  last = strm.next_in[0] & 1;
367  if (last && clr)
368  in->buf[strm.next_in - in->buf] &= ~1; /* same byte as strm.next_in[0], addressed through in->buf */
369  }
370  }
371  }
372 
373  /* update buffer with unused input */
374  in->left = strm.avail_in;
375  in->next = in->buf + (strm.next_in - in->buf);
376 
377  /* copy used input, write empty blocks to get to byte boundary; the final used byte is held back in last because it may need changes */
378  pos = strm.data_type & 7;
379  fwrite(start, 1, in->next - start - 1, out);
380  last = in->next[-1];
381  if (pos == 0 || !clr)
382  /* already at byte boundary, or last file: write last byte */
383  putc(last, out);
384  else {
385  /* append empty blocks to last byte */
386  last &= ((0x100 >> pos) - 1); /* assure unused bits are zero */
387  if (pos & 1) {
388  /* odd -- append an empty stored block */
389  putc(last, out);
390  if (pos == 1)
391  putc(0, out); /* two more bits in block header */
392  fwrite("\0\0\xff\xff", 1, 4, out);
393  }
394  else {
395  /* even -- append 1, 2, or 3 empty fixed blocks */
396  switch (pos) {
397  case 6:
398  putc(last | 8, out);
399  last = 0; /* intentional fall through: continue with the next byte */
400  case 4:
401  putc(last | 0x20, out);
402  last = 0; /* intentional fall through: continue with the next byte */
403  case 2:
404  putc(last | 0x80, out);
405  putc(0, out);
406  }
407  }
408  }
409 
410  /* update crc and tot; bget4() reads the four bytes that follow the compressed data in the input, and that value is combined into *crc as the crc of the len bytes just appended */
411  *crc = crc32_combine(*crc, bget4(in), len);
412  *tot += (unsigned long)len;
413 
414  /* clean up */
415  inflateEnd(&strm);
416  free(junk);
417  bclose(in);
418 
419  /* write trailer if this is the last gzip file */
420  if (!clr) {
421  put4(*crc, out);
422  put4(*tot, out);
423  }
424 }
425 
/* Join the gzip files named on the command line and write the resulting
   single gzip file to stdout.  With no arguments a usage message is printed
   on stderr and 0 is returned.  Input problems end the program with status 1
   through bail(); so does a failure to write the output, which is detected
   once at the end because the individual write calls are not checked. */
int main(int argc, char **argv)
{
    unsigned long crc, tot;     /* running crc and total uncompressed length */

    /* skip command name */
    argc--;
    argv++;

    /* show usage if no arguments */
    if (argc == 0) {
        fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
              stderr);
        return 0;
    }

    /* join gzip files on command line and write to stdout; after the
       decrement argc is the number of files still to come, so it is zero
       (clr false) only for the last file */
    gzinit(&crc, &tot, stdout);
    while (argc--)
        gzcopy(*argv++, argc, &crc, &tot, stdout);

    /* a full disk or closed pipe must not be reported as success */
    if (fflush(stdout) != 0 || ferror(stdout))
        bail("write error on ", "stdout");

    /* done */
    return 0;
}
unsigned char * next
Definition: gzjoin.c:82
#define Z_BLOCK
Definition: zlib.h:169
static void zpull(z_streamp strm, bin *in)
Definition: gzjoin.c:254
voidpf opaque
Definition: zlib.h:99
static void bskip(bin *in, unsigned skip)
Definition: gzjoin.c:157
static int bload(bin *in)
Definition: gzjoin.c:121
char * name
Definition: gzjoin.c:79
#define local
Definition: gzjoin.c:63
static void gzhead(bin *in)
Definition: gzjoin.c:204
free_func zfree
Definition: zlib.h:98
#define z_off_t
Definition: zconf.h:481
unsigned char * buf
Definition: gzjoin.c:83
int main(int argc, char **argv)
Definition: gzjoin.c:427
static int bail(char *why1, char *why2)
Definition: gzjoin.c:66
int data_type
Definition: zlib.h:101
static unsigned long bget4(bin *in)
Definition: gzjoin.c:145
static void skip(file *in, unsigned n)
Definition: gzappend.c:202
unsigned left
Definition: gzjoin.c:81
void free()
alloc_func zalloc
Definition: zlib.h:97
static void bclose(bin *in)
Definition: gzjoin.c:87
unsigned long crc32(unsigned long crc, unsigned char *buf, uInt len)
Definition: crc32.c:204
int fd
Definition: gzjoin.c:80
Bytef * next_in
Definition: zlib.h:86
#define inflateInit2(strm, windowBits)
Definition: zlib.h:1654
#define bget(in)
Definition: gzjoin.c:140
#define Z_DATA_ERROR
Definition: zlib.h:178
static int out(void *out_desc, unsigned char *buf, unsigned len)
Definition: gun.c:131
static void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot, FILE *out)
Definition: gzjoin.c:279
static bin * bopen(char *name)
Definition: gzjoin.c:100
#define SEEK_CUR
Definition: zip.c:80
Definition: zlib.h:85
Definition: gzjoin.c:78
int read(izstream &zs, T *x, Items items)
Definition: zstream.h:115
#define Z_MEM_ERROR
Definition: zlib.h:179
static void put4(unsigned long val, FILE *out)
Definition: gzjoin.c:245
uInt avail_out
Definition: zlib.h:91
uLong crc32_combine(uLong crc1, uLong crc2, long len2)
Definition: crc32.c:411
static unsigned in(void *in_desc, z_const unsigned char **buf)
Definition: gun.c:89
#define Z_OK
Definition: zlib.h:173
int inflateEnd(z_streamp strm)
Definition: inflate.c:1254
int inflate(z_streamp strm, int flush)
Definition: inflate.c:605
Bytef * next_out
Definition: zlib.h:90
#define Z_NULL
Definition: zlib.h:208
#define CHUNK
Definition: gzjoin.c:75
uInt avail_in
Definition: zlib.h:87
voidp malloc()
static void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
Definition: gzjoin.c:265