bmzip.c

Go to the documentation of this file.
00001 
00022 #include "Common/compat-c.h"
00023 #include <stdio.h>
00024 #include <stdlib.h>
00025 #include <string.h>
00026 #include <unistd.h>
00027 #include <limits.h>
00028 #include <stdint.h>
00029 #include <stdarg.h>
00030 #include <errno.h>
00031 #include <fcntl.h>
00032 #include <sys/stat.h>
00033 #ifndef HT_NO_MMAP
00034 #  include <sys/mman.h>
00035 static int s_no_mmap = 0;
00036 #else
00037 static int s_no_mmap = 1;
00038 #endif
00039 
00040 #include "bmz-internal.h"
00041 
/* Magic bytes that open every bmz file. */
#define BMZ_MAGIC       "BMZ"
/* Format version written into (and accepted from) the file header. */
#define BMZIP_VER       0x0110
/*
 * Header size in bytes: magic + 2 (version) + 1 (options) + 6 (original
 * size) + 4 (checksum).  sizeof (minus the terminating NUL) is used rather
 * than strlen so the value is an integer constant expression; arrays sized
 * with it are then ordinary fixed-size arrays instead of VLAs.
 */
#define BMZ_HEADER_SZ   (sizeof(BMZ_MAGIC) - 1 + 2 + 1 + 6 + 4)

/* Actions selectable on the command line. */
#define BMZ_A_PACK      0
#define BMZ_A_UNPACK    1
#define BMZ_A_LIST      2

/* Option bits stored in the header's options byte. */
#define BMZ_O_BM_ONLY   1
#define BMZ_O_STREAM    2       /* TODO */
00052 
/* Raw octet type used for every data buffer in this file. */
typedef unsigned char Byte;

/* Cast targets that silence warnings in format strings */
typedef unsigned long long Llu;
typedef unsigned long Lu;

/* Run-time settings; main() assigns them from the command line. */
static int s_verbosity;  /* threshold compared against the LOG level */
static int s_bm_dump;    /* set by --bm-dump */
static int s_bm_hash;    /* set by --bm-hash */
00062 
/*
 * LOG(level, fmt, ...): when s_verbosity >= level, print
 * "bmzip: <function>: <message>" to stderr, then ": <strerror(errno)>" if
 * errno is non-zero, then a newline.
 *
 * The verbosity test sits inside the do/while (0) so the macro expands to
 * exactly one statement and cannot capture the `else` of an enclosing
 * unbraced if/else.
 */
#define LOG(_lvl_, _fmt_, ...) do { \
  if (s_verbosity >= (_lvl_)) { \
    fprintf(stderr, "bmzip: %s: " _fmt_, __FUNCTION__, ##__VA_ARGS__); \
    if (errno) fprintf(stderr, ": %s", strerror(errno)); \
    putc('\n', stderr); \
  } \
} while (0)

/* WARN(fmt, ...): level-0 LOG message prefixed with "warning: ". */
#define WARN(_fmt_, ...) do { \
  LOG(0, "warning: " _fmt_, ##__VA_ARGS__); \
} while (0)

/* DIE(fmt, ...): level-0 LOG message prefixed with "fatal: ", then exit(1). */
#define DIE(_fmt_, ...) do { \
  LOG(0, "fatal: " _fmt_, ##__VA_ARGS__); \
  exit(1); \
} while (0)
00077 
/*
 * Advance _mem_ to the next multiple of _n_ bytes and yield it as a Byte *.
 * The result is always strictly greater than _mem_: a pointer that is
 * already aligned moves forward by a full _n_ bytes.
 * The expansion and both arguments are parenthesized so the macro can be
 * used inside larger expressions; each argument is evaluated twice.
 */
#define BMZ_ALIGN(_mem_, _n_) \
  ((Byte *)(_mem_) + (_n_) - (((size_t)(_mem_)) % (_n_)))
00079 
/*
 * Big-endian integer readers: decode 2, 4 or 6 bytes starting at *_p_ into
 * the lvalue _n_ and advance _p_ past them.
 *
 * Each byte is converted to an unsigned type of sufficient width before it
 * is shifted.  Shifting the promoted (signed int) byte left by 24 overflows
 * for values >= 0x80 and, when OR-ed into a 64-bit destination, sign-extends
 * into the upper bits.  Each macro is wrapped in do/while (0) so it is a
 * single statement.
 */
#define BMZ_READ_INT16(_p_, _n_) do { \
  (_n_) = (unsigned)*(_p_)++ << 8; \
  (_n_) |= (unsigned)*(_p_)++; \
} while (0)

#define BMZ_READ_INT32(_p_, _n_) do { \
  (_n_) = (uint32_t)*(_p_)++ << 24; \
  (_n_) |= (uint32_t)*(_p_)++ << 16; \
  (_n_) |= (uint32_t)*(_p_)++ << 8; \
  (_n_) |= (uint32_t)*(_p_)++; \
} while (0)

#define BMZ_READ_INT48(_p_, _n_) do { \
  (_n_) = (uint64_t)*(_p_)++ << 40; \
  (_n_) |= (uint64_t)*(_p_)++ << 32; \
  (_n_) |= (uint64_t)*(_p_)++ << 24; \
  (_n_) |= (uint64_t)*(_p_)++ << 16; \
  (_n_) |= (uint64_t)*(_p_)++ << 8; \
  (_n_) |= (uint64_t)*(_p_)++; \
} while (0)
00097 
/*
 * Big-endian integer writers: store the low 2, 4 or 6 bytes of _n_ at *_p_
 * and advance _p_ past them.
 *
 * _n_ is converted to an unsigned type wide enough for the largest shift, so
 * a 32-bit argument to BMZ_WRITE_INT48 is not shifted by more than its
 * width.  Each macro is wrapped in do/while (0) so that it forms a single
 * statement (safe under an unbraced if).
 */
#define BMZ_WRITE_INT16(_p_, _n_) do { \
  *(_p_)++ = (Byte)((unsigned)(_n_) >> 8); \
  *(_p_)++ = (Byte)(_n_); \
} while (0)

#define BMZ_WRITE_INT32(_p_, _n_) do { \
  *(_p_)++ = (Byte)((uint32_t)(_n_) >> 24); \
  *(_p_)++ = (Byte)((uint32_t)(_n_) >> 16); \
  *(_p_)++ = (Byte)((uint32_t)(_n_) >> 8); \
  *(_p_)++ = (Byte)(_n_); \
} while (0)

#define BMZ_WRITE_INT48(_p_, _n_) do { \
  *(_p_)++ = (Byte)((uint64_t)(_n_) >> 40); \
  *(_p_)++ = (Byte)((uint64_t)(_n_) >> 32); \
  *(_p_)++ = (Byte)((uint64_t)(_n_) >> 24); \
  *(_p_)++ = (Byte)((uint64_t)(_n_) >> 16); \
  *(_p_)++ = (Byte)((uint64_t)(_n_) >> 8); \
  *(_p_)++ = (Byte)(_n_); \
} while (0)
00115 
00116 static void
00117 read_bmz_header(int fd, Byte *buf) {
00118   if (read(fd, buf, BMZ_HEADER_SZ) != BMZ_HEADER_SZ)
00119     DIE("error reading bmz file header (%lu bytes)", (Lu)BMZ_HEADER_SZ);
00120 }
00121 
00122 static void
00123 parse_bmz_header(const Byte *buf, uint16_t *version_p, uint64_t *orig_size_p,
00124                 uint32_t *checksum_p, uint32_t *options) {
00125   const Byte *bp = buf;
00126   size_t magic_len = strlen(BMZ_MAGIC);
00127 
00128   if (memcmp(buf, BMZ_MAGIC, magic_len)) {
00129     DIE("bad magic in file header (%lu bytes)", (Lu)magic_len);
00130   }
00131   bp += magic_len;
00132   BMZ_READ_INT16(bp, *version_p);
00133 
00134   if (*version_p > BMZIP_VER)
00135     DIE("incomaptible version: %04x", *version_p);
00136 
00137   *options = *bp++;
00138   BMZ_READ_INT48(bp, *orig_size_p);
00139   BMZ_READ_INT32(bp, *checksum_p);
00140 }
00141 
00142 static void
00143 write_bmz_header(int fd, size_t in_len, uint32_t checksum, Byte options) {
00144   char buf[BMZ_HEADER_SZ], *bp = buf;
00145   uint64_t orig_size = in_len;
00146 
00147   strcpy(buf, BMZ_MAGIC);
00148   bp += strlen(BMZ_MAGIC);
00149   BMZ_WRITE_INT16(bp, BMZIP_VER);
00150   *bp++ = options;
00151   BMZ_WRITE_INT48(bp, orig_size);
00152   BMZ_WRITE_INT32(bp, checksum);
00153 
00154   if (write(fd, buf, BMZ_HEADER_SZ) != BMZ_HEADER_SZ)
00155     DIE("error writing header (%lu bytes)", (Lu)BMZ_HEADER_SZ);
00156 }
00157 
00158 static void
00159 do_list(int fd) {
00160   Byte buf[BMZ_HEADER_SZ];
00161   uint16_t version;
00162   uint64_t orig_size, size;
00163   uint32_t checksum, options;
00164   struct stat st;
00165 
00166   if (fstat(fd, &st) != 0) DIE("error getting stat from file (%d)", fd);
00167 
00168   size = st.st_size;
00169   read_bmz_header(fd, buf);
00170   parse_bmz_header(buf, &version, &orig_size, &checksum, &options);
00171   printf("%8s%16s%16s%8s\n", "version", "compressed", "uncompressed", "ratio");
00172   printf("    %04x%16llu%16llu%7.2f%%\n", version, (Llu)size,
00173          (Llu)orig_size, orig_size ? size * 100. / orig_size : 1);
00174 }
00175 
00176 static void
00177 do_pack(const void *in, size_t in_len, size_t buf_len,
00178         size_t offset, size_t fp_len, Byte options) {
00179   size_t buflen = bmz_pack_buflen(in_len), out_len = buflen;
00180   size_t worklen = bmz_pack_worklen(in_len, fp_len);
00181   int ret, bm_only = (options & BMZ_O_BM_ONLY) || s_bm_dump;
00182   Byte *out, *work_mem;
00183 
00184   if (bm_only) {
00185     out_len = in_len + 1;
00186 
00187     if (buf_len > in_len + worklen) {
00188       out = (Byte *)in + in_len;
00189       work_mem = out + out_len;
00190     }
00191     else {
00192       out = malloc(worklen); /* bmz_pack_worklen includes out_len for bm */
00193 
00194       if (!out)
00195         DIE("error allocating %lu bytes memory", (Lu)worklen);
00196 
00197       work_mem = out + out_len;
00198     }
00199     /* calling internal API need to align work memory */
00200     work_mem = BMZ_ALIGN(work_mem, 8);
00201   }
00202   else if (buf_len > buflen + worklen) {
00203     work_mem = (Byte *)in + buflen;
00204     out = (Byte *)in; /* inplace */
00205   }
00206   else {
00207     out = malloc(buflen + worklen);
00208 
00209     if (!out)
00210       DIE("error allocating %lu bytes memory", (Lu)buflen + worklen);
00211 
00212     work_mem = out + buflen;
00213   }
00214 
00215   if (bm_only) {
00216     ret = bmz_bm_pack_mask(in, in_len, out, &out_len, offset, fp_len,
00217                            work_mem, 257);
00218     if (ret != BMZ_E_OK)
00219       DIE("error encoding bm output (error %d)", ret);
00220 
00221     if (s_bm_dump) {
00222       if ((ret = bmz_bm_dump(out, out_len)) != BMZ_E_OK)
00223         WARN("error dumping bm encoding (ret=%d)", ret);
00224 
00225       return;
00226     }
00227   }
00228   else if ((ret = bmz_pack(in, in_len, out, &out_len, offset, fp_len,
00229                            (s_bm_hash << 24), work_mem))
00230            != BMZ_E_OK) {
00231     DIE("error compressing input (error %d)", ret);
00232   }
00233   write_bmz_header(1, in_len, bmz_checksum(out, out_len), options);
00234   write(1, out, out_len);
00235 }
00236 
00237 static void
00238 do_unpack(const void *in, size_t in_len, size_t buf_len) {
00239   const Byte *bp = (Byte *)in;
00240   uint16_t version;
00241   uint64_t orig_size;
00242   uint32_t checksum, cs, options;
00243   size_t outlen, worklen, len = in_len - BMZ_HEADER_SZ;
00244   Byte *out, *workmem;
00245   int ret;
00246 
00247   if (in_len < BMZ_HEADER_SZ) DIE("file truncated (size: %lu)", (Lu)in_len);
00248 
00249   parse_bmz_header(bp, &version, &orig_size, &checksum, &options);
00250 
00251   if (orig_size > INT_MAX && sizeof(size_t) == 4)
00252     DIE("original file size %llu requires 64-bit version of bmzip",
00253         (Llu)orig_size);
00254 
00255   bp += BMZ_HEADER_SZ;
00256   buf_len -= BMZ_HEADER_SZ;
00257   cs = bmz_checksum(bp, len);
00258   outlen = orig_size;
00259 
00260   if (cs != checksum)
00261     DIE("checksum mismatch (expecting %x, got %x).", checksum, cs);
00262 
00263   if (options & BMZ_O_BM_ONLY) {
00264     out = buf_len > in_len + orig_size ? (Byte*)bp + len : malloc(outlen);
00265 
00266     if ((ret = bmz_bm_unpack(bp, len, out, &outlen)) != BMZ_E_OK)
00267       DIE("error decoding bm input (error %d)", ret);
00268   }
00269   else {
00270     worklen = bmz_unpack_worklen(orig_size > len ? orig_size : len);
00271     out = (buf_len > outlen + worklen) ? (Byte *)bp : malloc(outlen + worklen);
00272     workmem = out + outlen;
00273 
00274     if ((ret = bmz_unpack(bp, len, out, &outlen, workmem)) != BMZ_E_OK)
00275       DIE("error decompressing (error %d)", ret);
00276   }
00277   if (orig_size != outlen)
00278     WARN("size mismatch (expecting %llu, got %llu)",
00279          (Llu)orig_size, (Llu)outlen);
00280 
00281   write(1, out, outlen);
00282 }
00283 
00284 static void
00285 do_block(const void *in, size_t len, size_t buf_len, size_t offset,
00286          size_t fp_len, int action, int options) {
00287   switch (action) {
00288   case BMZ_A_PACK:
00289     do_pack(in, len, buf_len, offset, fp_len, options);
00290     break;
00291   case BMZ_A_UNPACK:
00292     do_unpack(in, len, buf_len);
00293     break;
00294   default:
00295     DIE("unknown action: %d", action);
00296   }
00297 }
00298 
00299 static char *
00300 read_from_fp(FILE *fp, size_t *len_p, size_t *size_p) {
00301   char *data = NULL;
00302   char buf[65536];
00303   int64_t len = 0, size = 0, ret;
00304 
00305   while ((ret = fread(buf, 1, sizeof(buf), fp)) > 0) {
00306     len += ret;
00307     if (len > INT_MAX)
00308       DIE("reading from stdin for data size greater than 2GB "
00309           "not yet supported (current size: %lld)", (long long)len);
00310 
00311     if (len > size) {
00312       size = (len + 16) * 5 / 2;
00313       data = realloc(data, size);
00314     }
00315     memcpy(data + len - ret, buf, ret);
00316   }
00317   *len_p = len;
00318   *size_p = size;
00319   return data;
00320 }
00321 
00322 static char *
00323 read_from_fd(int fd, size_t *len_p, size_t *size_p) {
00324   struct stat st;
00325   void *data = NULL;
00326   size_t sz;
00327 
00328   if (fstat(fd, &st) != 0) DIE("cannot stat fd <%d>", fd);
00329 
00330   if (st.st_size > INT_MAX && sizeof(size_t) == 4)
00331     DIE("file size %llu requires 64-bit version of bmzip",
00332         (Llu)st.st_size);
00333 
00334   sz = *len_p = *size_p = st.st_size;
00335 
00336   if (!sz) return data;
00337 
00338   if (!s_no_mmap) {
00339 #ifndef HT_NO_MMAP
00340     LOG(1, "mmapping file (size: %lu)...", (Lu)sz);
00341     data = mmap(NULL, sz, PROT_READ, MAP_PRIVATE, fd, 0);
00342 
00343     if (!data || (void *)-1 == data) {
00344       LOG(1, "mmap failed on fd %d", fd);
00345       errno = 0;
00346       LOG(1, "%s", "trying alternative");
00347       data = NULL;
00348     }
00349 #endif
00350   }
00351   if (!data) {
00352     LOG(1, "reading file (size: %lu) into memory...", (Lu)sz);
00353     data = malloc(sz);
00354 
00355     if (!data) DIE("cannot allocate %lu bytes memory", (Lu)sz);
00356 
00357     if (read(fd, data, sz) != sz) DIE("error reading %lu bytes", (Lu)sz);
00358   }
00359 
00360   return data;
00361 }
00362 
00363 static void
00364 input_from_stdin(size_t offset, size_t fp_len, int action, int options) {
00365   size_t len, buf_len;
00366 
00367   if (action == BMZ_A_LIST) {
00368     do_list(0);
00369   }
00370   else {
00371     void *data = read_from_fp(stdin, &len, &buf_len);
00372     do_block(data, len, buf_len, offset, fp_len, action, options);
00373   }
00374 }
00375 
00376 static void
00377 input_from_file(const char *fname, size_t offset, size_t fp_len, int action,
00378                 int options) {
00379   size_t len, buf_len;
00380   int fd = open(fname, O_RDONLY, 0);
00381 
00382   if (fd == -1) DIE("cannot open '%s'", fname);
00383 
00384   if (action == BMZ_A_LIST) {
00385     do_list(fd);
00386   }
00387   else {
00388     void *data = read_from_fd(fd, &len, &buf_len);
00389     do_block(data, len, buf_len, offset, fp_len, action, options);
00390   }
00391   /* close and free etc. are omitted intentionally */
00392 }
00393 
00394 static int
00395 bm_hash(const char *name) {
00396 
00397   if (!strcmp("mod", name))             return BMZ_HASH_MOD;
00398   else if (!strcmp("mod16x2", name))    return BMZ_HASH_MOD16X2;
00399   else if (!strcmp("mask16x2", name))   return BMZ_HASH_MASK16X2;
00400   else if (!strcmp("mask", name))       return BMZ_HASH_MASK;
00401   else if (!strcmp("mask32x2", name))   return BMZ_HASH_MASK32X2;
00402 
00403   DIE("unknown hash: %s", name);
00404   return 0;
00405 }
00406 
00407 static void HT_NORETURN
00408 show_usage() {
00409   fprintf(stderr, "%s%s", /* c89 string literal limit is 509 */
00410     "usage: bmzip [options] [<file>]\n"
00411     "-d, --decompress         decompress to stdout\n"
00412     "--verbose[=level]        show some diagnostic messages\n"
00413     "-l, --list               list compressed file info\n"
00414     "-h, --help               show this message\n"
00415     "--offset    <number>     expert: bm encoding start offset\n"
00416     "--fp-len    <number>     expert: bm encoding fingerprint size\n"
00417     "--bm-thresh <number>     expert: bm hash collision threshold\n",
00418     "--bm-hash   <name>       expert: use <name> as bm hash\n"
00419     "--bm-only                expert: skip lz compression\n"
00420     "--bm-dump                expert: dump human readable bm encoding\n"
00421     "--no-mmap                expert: do not use mmap\n");
00422   exit(0);
00423 }
00424 
00425 int
00426 main(int ac, char *av[]) {
00427   char **ia = av + 1, **a_end = av + ac;
00428   /* defaults */
00429   size_t fp_len = 64, offset = 0;
00430   int bm_thresh = 0, action = BMZ_A_PACK, options = 0;
00431 
00432   for (; ia < a_end; ++ia) {
00433     if (!strcmp("-d", *ia) ||
00434         !strcmp("--decompress", *ia))           action = BMZ_A_UNPACK;
00435     else if (!strcmp("--verbose", *ia))         s_verbosity = 1;
00436     else if (!strcmp("--verbose=", *ia))        s_verbosity = atoi(*ia + 9);
00437     else if (!strcmp("--offset", *ia))          offset = atoi(*++ia);
00438     else if (!strcmp("--fp-len", *ia))          fp_len = atoi(*++ia);
00439     else if (!strcmp("--bm-only", *ia))         options |= BMZ_O_BM_ONLY;
00440     else if (!strcmp("--bm-dump", *ia))         s_bm_dump = 1;
00441     else if (!strcmp("--no-mmap", *ia))         s_no_mmap = 1;
00442     else if (!strcmp("--bm-thresh", *ia))       bm_thresh = atoi(*++ia);
00443     else if (!strcmp("--bm-hash", *ia))         s_bm_hash = bm_hash(*++ia);
00444     else if (!strcmp("-l", *ia) ||
00445              !strcmp("--list", *ia))            action = BMZ_A_LIST;
00446     else if (!strcmp("-h", *ia) ||
00447              !strcmp("--help", *ia)) {
00448       show_usage();
00449     }
00450     else if (!strcmp("--version", *ia)) {
00451       LOG(0, "version %d.%d.%d.%d", BMZIP_VER >> 12, (BMZIP_VER >> 8) & 0xf,
00452           (BMZIP_VER >> 4) & 0xf, BMZIP_VER & 0xf);
00453       exit(0);
00454     }
00455     else if (!strcmp("--", *ia)) {
00456       ++ia;
00457       break;
00458     }
00459     else if ('-' == **ia)
00460       DIE("unknown option: %s\n", *ia);
00461     else break;
00462   }
00463   if (s_verbosity)
00464     bmz_set_verbosity(s_verbosity);
00465 
00466   if (bm_thresh)
00467     bmz_set_collision_thresh(bm_thresh);
00468 
00469   if (ia >= a_end)
00470     input_from_stdin(offset, fp_len, action, options);
00471   else
00472     input_from_file(*ia, offset, fp_len, action, options);
00473 
00474   return 0;
00475 }