HPCToolkit
compress_lzma.c
Go to the documentation of this file.
1 // -*-Mode: C++;-*-
2 
3 // * BeginRiceCopyright *****************************************************
4 //
5 // $HeadURL$
6 // $Id$
7 //
8 // --------------------------------------------------------------------------
9 // Part of HPCToolkit (hpctoolkit.org)
10 //
11 // Information about sources of support for research and development of
12 // HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
13 // --------------------------------------------------------------------------
14 //
15 // Copyright (c) 2002-2019, Rice University
16 // All rights reserved.
17 //
18 // Redistribution and use in source and binary forms, with or without
19 // modification, are permitted provided that the following conditions are
20 // met:
21 //
22 // * Redistributions of source code must retain the above copyright
23 // notice, this list of conditions and the following disclaimer.
24 //
25 // * Redistributions in binary form must reproduce the above copyright
26 // notice, this list of conditions and the following disclaimer in the
27 // documentation and/or other materials provided with the distribution.
28 //
29 // * Neither the name of Rice University (RICE) nor the names of its
30 // contributors may be used to endorse or promote products derived from
31 // this software without specific prior written permission.
32 //
33 // This software is provided by RICE and contributors "as is" and any
34 // express or implied warranties, including, but not limited to, the
35 // implied warranties of merchantability and fitness for a particular
36 // purpose are disclaimed. In no event shall RICE or contributors be
37 // liable for any direct, indirect, incidental, special, exemplary, or
38 // consequential damages (including, but not limited to, procurement of
39 // substitute goods or services; loss of use, data, or profits; or
40 // business interruption) however caused and on any theory of liability,
41 // whether in contract, strict liability, or tort (including negligence
42 // or otherwise) arising in any way out of the use of this software, even
43 // if advised of the possibility of such damage.
44 //
45 // ******************************************************* EndRiceCopyright *
46 
47 //***************************************************************************
48 //
49 // File:
50 // $HeadURL$
51 //
52 // Purpose:
53 // LZMA (.xz) implementation of the file compression interface declared in compress.h.
54 //
55 // Description:
56 // compress_deflate() and compress_inflate(), plus static helpers that drive liblzma's stream encoder and decoder.
57 //
58 //***************************************************************************
59 
60 #include <stdbool.h>
61 #include <stdlib.h>
62 #include <stdio.h>
63 #include <string.h>
64 #include <errno.h>
65 #include <lzma.h>
66 
67 #include "compress.h"
68 
69 
70 //***************************************************************************
71 // forward declarations
72 //***************************************************************************
73 
74 static const char*
75 get_error_message(lzma_ret ret) __attribute__ ((unused));
76 
77 
78 
79 //***************************************************************************
80 // private operations
81 //***************************************************************************
82 
83 static lzma_ret
84 init_encoder(lzma_stream *strm, uint32_t preset)
85 {
86  // Initialize the encoder using a preset. Set the integrity to check
87  // to CRC64, which is the default in the xz command line tool. If
88  // the .xz file needs to be decompressed with XZ Embedded, use
89  // LZMA_CHECK_CRC32 instead.
90  lzma_ret ret = lzma_easy_encoder(strm, preset, LZMA_CHECK_CRC64);
91  return ret;
92 }
93 
94 
95 static const char*
96 get_error_message(lzma_ret ret)
97 {
98  // Return successfully if the initialization went fine.
99  if (ret == LZMA_OK)
100  return NULL;
101 
102  // Something went wrong. The possible errors are documented in
103  // lzma/container.h (src/liblzma/api/lzma/container.h in the source
104  // package or e.g. /usr/include/lzma/container.h depending on the
105  // install prefix).
106  const char *msg;
107  switch (ret) {
108  case LZMA_MEM_ERROR:
109  msg = "Memory allocation failed";
110  break;
111 
112  case LZMA_OPTIONS_ERROR:
113  msg = "Specified preset is not supported";
114  break;
115 
116  case LZMA_UNSUPPORTED_CHECK:
117  msg = "Specified integrity check is not supported";
118  break;
119 
120  case LZMA_FORMAT_ERROR:
121  // .xz magic bytes weren't found.
122  msg = "The input is not in the .xz format";
123  break;
124 
125  case LZMA_DATA_ERROR:
126  // This error is returned if the compressed
127  // or uncompressed size get near 8 EiB
128  // (2^63 bytes) because that's where the .xz
129  // file format size limits currently are.
130  // That is, the possibility of this error
131  // is mostly theoretical unless you are doing
132  // something very unusual.
133  //
134  // Note that strm->total_in and strm->total_out
135  // have nothing to do with this error. Changing
136  // those variables won't increase or decrease
137  // the chance of getting this error.
138  msg = "File size limits exceeded";
139  break;
140 
141  case LZMA_BUF_ERROR:
142  // Typically this error means that a valid
143  // file has got truncated, but it might also
144  // be a damaged part in the file that makes
145  // the decoder think the file is truncated.
146  // If you prefer, you can use the same error
147  // message for this as for LZMA_DATA_ERROR.
148  msg = "Compressed file is truncated or "
149  "otherwise corrupt";
150  break;
151 
152  default:
153  // This is most likely LZMA_PROG_ERROR, but
154  // if this program is buggy (or liblzma has
155  // a bug), it may be e.g. LZMA_BUF_ERROR or
156  // LZMA_OPTIONS_ERROR too.
157  //
158  // It is inconvenient to have a separate
159  // error message for errors that should be
160  // impossible to occur, but knowing the error
161  // code is important for debugging. That's why
162  // it is good to print the error code at least
163  // when there is no good error message to show.
164  msg = "Unknown error, possibly a bug";
165  break;
166  }
167  return msg;
168 }
169 
170 
171 static lzma_ret
172 compress(lzma_stream *strm, FILE *infile, FILE *outfile)
173 {
174  // This will be LZMA_RUN until the end of the input file is reached.
175  // This tells lzma_code() when there will be no more input.
176  lzma_action action = LZMA_RUN;
177 
178  // Buffers to temporarily hold uncompressed input
179  // and compressed output.
180  uint8_t inbuf[BUFSIZ];
181  uint8_t outbuf[BUFSIZ];
182 
183  // Initialize the input and output pointers. Initializing next_in
184  // and avail_in isn't really necessary when we are going to encode
185  // just one file since LZMA_STREAM_INIT takes care of initializing
186  // those already. But it doesn't hurt much and it will be needed
187  // if encoding more than one file like we will in 02_decompress.c.
188  //
189  // While we don't care about strm->total_in or strm->total_out in this
190  // example, it is worth noting that initializing the encoder will
191  // always reset total_in and total_out to zero. But the encoder
192  // initialization doesn't touch next_in, avail_in, next_out, or
193  // avail_out.
194  strm->next_in = NULL;
195  strm->avail_in = 0;
196  strm->next_out = outbuf;
197  strm->avail_out = sizeof(outbuf);
198 
199  // Loop until the file has been successfully compressed or until
200  // an error occurs.
201  while (true) {
202  // Fill the input buffer if it is empty.
203  if (strm->avail_in == 0 && !feof(infile)) {
204  strm->next_in = inbuf;
205  strm->avail_in = fread(inbuf, 1, sizeof(inbuf),
206  infile);
207 
208  if (ferror(infile)) {
209  return LZMA_PROG_ERROR;
210  }
211 
212  // Once the end of the input file has been reached,
213  // we need to tell lzma_code() that no more input
214  // will be coming and that it should finish the
215  // encoding.
216  if (feof(infile))
217  action = LZMA_FINISH;
218  }
219 
220  // Tell liblzma do the actual encoding.
221  //
222  // This reads up to strm->avail_in bytes of input starting
223  // from strm->next_in. avail_in will be decremented and
224  // next_in incremented by an equal amount to match the
225  // number of input bytes consumed.
226  //
227  // Up to strm->avail_out bytes of compressed output will be
228  // written starting from strm->next_out. avail_out and next_out
229  // will be incremented by an equal amount to match the number
230  // of output bytes written.
231  //
232  // The encoder has to do internal buffering, which means that
233  // it may take quite a bit of input before the same data is
234  // available in compressed form in the output buffer.
235  lzma_ret ret = lzma_code(strm, action);
236 
237  // If the output buffer is full or if the compression finished
238  // successfully, write the data from the output bufffer to
239  // the output file.
240  if (strm->avail_out == 0 || ret == LZMA_STREAM_END) {
241  // When lzma_code() has returned LZMA_STREAM_END,
242  // the output buffer is likely to be only partially
243  // full. Calculate how much new data there is to
244  // be written to the output file.
245  size_t write_size = sizeof(outbuf) - strm->avail_out;
246 
247  if (fwrite(outbuf, 1, write_size, outfile)
248  != write_size) {
249  return LZMA_PROG_ERROR;
250  }
251 
252  // Reset next_out and avail_out.
253  strm->next_out = outbuf;
254  strm->avail_out = sizeof(outbuf);
255  }
256 
257  // Normally the return value of lzma_code() will be LZMA_OK
258  // until everything has been encoded.
259  if (ret != LZMA_OK) {
260  // Once everything has been encoded successfully, the
261  // return value of lzma_code() will be LZMA_STREAM_END.
262  //
263  // It is important to check for LZMA_STREAM_END. Do not
264  // assume that getting ret != LZMA_OK would mean that
265  // everything has gone well.
266  if (ret == LZMA_STREAM_END)
267  return LZMA_OK;
268 
269  // It's not LZMA_OK nor LZMA_STREAM_END,
270  // so it must be an error code. See lzma/base.h
271  // (src/liblzma/api/lzma/base.h in the source package
272  // or e.g. /usr/include/lzma/base.h depending on the
273  // install prefix) for the list and documentation of
274  // possible values. Most values listen in lzma_ret
275  // enumeration aren't possible in this example.
276  return ret;
277  }
278  }
279 }
280 
281 
282 static lzma_ret
283 decompress(lzma_stream *strm, FILE *infile, FILE *outfile)
284 {
285  // When LZMA_CONCATENATED flag was used when initializing the decoder,
286  // we need to tell lzma_code() when there will be no more input.
287  // This is done by setting action to LZMA_FINISH instead of LZMA_RUN
288  // in the same way as it is done when encoding.
289  //
290  // When LZMA_CONCATENATED isn't used, there is no need to use
291  // LZMA_FINISH to tell when all the input has been read, but it
292  // is still OK to use it if you want. When LZMA_CONCATENATED isn't
293  // used, the decoder will stop after the first .xz stream. In that
294  // case some unused data may be left in strm->next_in.
295  lzma_action action = LZMA_RUN;
296 
297  uint8_t inbuf[BUFSIZ];
298  uint8_t outbuf[BUFSIZ];
299 
300  strm->next_in = NULL;
301  strm->avail_in = 0;
302  strm->next_out = outbuf;
303  strm->avail_out = sizeof(outbuf);
304 
305  while (true) {
306  if (strm->avail_in == 0 && !feof(infile)) {
307  strm->next_in = inbuf;
308  strm->avail_in = fread(inbuf, 1, sizeof(inbuf),
309  infile);
310 
311  if (ferror(infile)) {
312  return LZMA_PROG_ERROR;
313  }
314 
315  // Once the end of the input file has been reached,
316  // we need to tell lzma_code() that no more input
317  // will be coming. As said before, this isn't required
318  // if the LZMA_CONATENATED flag isn't used when
319  // initializing the decoder.
320  if (feof(infile))
321  action = LZMA_FINISH;
322  }
323 
324  lzma_ret ret = lzma_code(strm, action);
325 
326  if (strm->avail_out == 0 || ret == LZMA_STREAM_END) {
327  size_t write_size = sizeof(outbuf) - strm->avail_out;
328 
329  if (fwrite(outbuf, 1, write_size, outfile)
330  != write_size) {
331  return LZMA_BUF_ERROR;
332  }
333 
334  strm->next_out = outbuf;
335  strm->avail_out = sizeof(outbuf);
336  }
337 
338  if (ret != LZMA_OK) {
339  // Once everything has been decoded successfully, the
340  // return value of lzma_code() will be LZMA_STREAM_END.
341  //
342  // It is important to check for LZMA_STREAM_END. Do not
343  // assume that getting ret != LZMA_OK would mean that
344  // everything has gone well or that when you aren't
345  // getting more output it must have successfully
346  // decoded everything.
347  if (ret == LZMA_STREAM_END)
348  return LZMA_OK;
349 
350  return ret;
351  }
352  }
353 }
354 
355 
356 static lzma_ret
357 init_decoder(lzma_stream *strm)
358 {
359  // Initialize a .xz decoder. The decoder supports a memory usage limit
360  // and a set of flags.
361  //
362  // The memory usage of the decompressor depends on the settings used
363  // to compress a .xz file. It can vary from less than a megabyte to
364  // a few gigabytes, but in practice (at least for now) it rarely
365  // exceeds 65 MiB because that's how much memory is required to
366  // decompress files created with "xz -9". Settings requiring more
367  // memory take extra effort to use and don't (at least for now)
368  // provide significantly better compression in most cases.
369  //
370  // Memory usage limit is useful if it is important that the
371  // decompressor won't consume gigabytes of memory. The need
372  // for limiting depends on the application. In this example,
373  // no memory usage limiting is used. This is done by setting
374  // the limit to UINT64_MAX.
375  //
376  // The .xz format allows concatenating compressed files as is:
377  //
378  // echo foo | xz > foobar.xz
379  // echo bar | xz >> foobar.xz
380  //
381  // When decompressing normal standalone .xz files, LZMA_CONCATENATED
382  // should always be used to support decompression of concatenated
383  // .xz files. If LZMA_CONCATENATED isn't used, the decoder will stop
384  // after the first .xz stream. This can be useful when .xz data has
385  // been embedded inside another file format.
386  //
387  // Flags other than LZMA_CONCATENATED are supported too, and can
388  // be combined with bitwise-or. See lzma/container.h
389  // (src/liblzma/api/lzma/container.h in the source package or e.g.
390  // /usr/include/lzma/container.h depending on the install prefix)
391  // for details.
392  lzma_ret ret = lzma_stream_decoder(
393  strm, UINT64_MAX, LZMA_CONCATENATED);
394 
395  return ret;
396 }
397 
398 
399 
400 //***************************************************************************
401 // interface operations
402 //***************************************************************************
403 
404 /* Compress from file source to file dest until EOF on source.
405 It returns:
406  COMPRESS_OK on success,
407  COMPRESS_FAIL if the inflate data is invalid or the version is
408  incorrect,
409  COMPRESS_IO_ERROR is there is an error reading or writing the file
410 
411  The compression level must be Z_DEFAULT_COMPRESSION,
412  or between 0 and 9: 1 gives best speed, 9 gives best compression,
413  0 gives no compression at all (the input data is simply copied a
414  block at a time). Z_DEFAULT_COMPRESSION requests a default compromise
415  between speed and compression (currently equivalent to level 6).
416  */
417 enum compress_e
418 compress_deflate(FILE *source, FILE *dest, int level)
419 {
420 
421  // Initialize a lzma_stream structure. When it is allocated on stack,
422  // it is simplest to use LZMA_STREAM_INIT macro like below. When it
423  // is allocated on heap, using memset(strmptr, 0, sizeof(*strmptr))
424  // works (as long as NULL pointers are represented with zero bits
425  // as they are on practically all computers today).
426  lzma_stream strm = LZMA_STREAM_INIT;
427 
428  // Initialize the encoder. If it succeeds, compress from
429  // source to dest.
430  lzma_ret ret = init_encoder(&strm, (uint32_t)level);
431  if (ret == LZMA_OK)
432  ret = compress(&strm, source, dest);
433 
434  // Free the memory allocated for the encoder. If we were encoding
435  // multiple files, this would only need to be done after the last
436  // file. See 02_decompress.c for handling of multiple files.
437  //
438  // It is OK to call lzma_end() multiple times or when it hasn't been
439  // actually used except initialized with LZMA_STREAM_INIT.
440  lzma_end(&strm);
441 
442  if (ret == LZMA_OK)
443  return COMPRESS_OK;
444 
445  return COMPRESS_FAIL;
446 }
447 
448 
449 /* Decompress from file source to file dest until stream ends or EOF.
450  It returns:
451  COMPRESS_OK on success,
452  COMPRESS_FAIL if the deflate data is invalid or the version is
453  incorrect,
454  COMPRESS_IO_ERROR is there is an error reading or writing the file
455  COMPRESS_NONE if decompression is not needed.
456  */
457 enum compress_e
458 compress_inflate(FILE *source, FILE *dest)
459 {
460  lzma_stream strm = LZMA_STREAM_INIT;
461 
462  lzma_ret ret = init_decoder(&strm);
463  if (ret != LZMA_OK) {
464  // Decoder initialization failed. There's no point
465  // to retry it so we need to exit.
466  return COMPRESS_FAIL;
467  }
468  ret = decompress(&strm, source, dest);
469 
470  // Free the memory allocated for the decoder. This only needs to be
471  // done after the last file.
472  lzma_end(&strm);
473 
474  return ret == LZMA_OK ? COMPRESS_OK : COMPRESS_FAIL;
475 }
476 
477 
478 #ifdef __UNIT_TEST_COMPRESS__
479 #include <errno.h>
480 #include <unistd.h>
481 #include <stdlib.h>
482 int main(int argc, char *argv[])
483 {
484  if (argc < 3) {
485  printf("syntax: %s input_compressed_file output_file\n", argv[0]);
486  exit(0);
487  }
488  FILE *fp_in = fopen(argv[1], "r");
489  FILE *fp_out = fopen(argv[2], "wx");
490 
491  if (fp_in == NULL || fp_out == NULL) {
492  perror("fail to open file:");
493  }
494  // test 1: compressing file
495  int ret = compress_deflate(fp_in, fp_out, 1);
496  if (ret != COMPRESS_OK) {
497  printf("cannot compress %s into %s\n", argv[1], argv[2]);
498  perror("compress fail:");
499  }
500  fclose(fp_in);
501  fclose(fp_out);
502 
503  // test 2: decompressing file
504  fp_out = fopen(argv[2], "r");
505  FILE *fp_def = tmpfile();
506  ret = compress_inflate(fp_out, fp_def);
507  if (ret != COMPRESS_OK) {
508  printf("cannot decompress %s\n", argv[2]);
509  perror("compress fail:");
510  }
511 
512  // testing the output
513  char buffer[11];
514  fseek(fp_def, 0, SEEK_SET);
515  fgets(buffer, 10, fp_def);
516  buffer[10] = '\0';
517  printf("file: '%s'\n", buffer);
518 
519  fclose(fp_out);
520  fclose(fp_def);
521 
522 }
523 #endif
enum compress_e compress_deflate(FILE *source, FILE *dest, int level)
enum compress_e compress_inflate(FILE *source, FILE *dest)
static lzma_ret init_encoder(lzma_stream *strm, uint32_t preset)
Definition: compress_lzma.c:84
exit
Definition: names.cpp:1
static lzma_ret init_decoder(lzma_stream *strm)
size_t MONITOR_EXT_WRAP_NAME() fread(void *ptr, size_t size, size_t count, FILE *stream)
Definition: io-over.c:226
void __attribute__((weak))
Definition: hpctoolkit.c:64
int main(int argc, char *argv[])
Definition: main.cpp:125
static lzma_ret compress(lzma_stream *strm, FILE *infile, FILE *outfile)
static const char * get_error_message(lzma_ret ret) __attribute__((unused))
Definition: compress_lzma.c:96
#define NULL
Definition: ElfHelper.cpp:85
size_t MONITOR_EXT_WRAP_NAME() fwrite(const void *ptr, size_t size, size_t count, FILE *stream)
Definition: io-over.c:260
compress_e
Definition: compress.h:73
static lzma_ret decompress(lzma_stream *strm, FILE *infile, FILE *outfile)
static char * inbuf
Definition: server.cpp:112