LRez  v2.1
gzIndex.h
Go to the documentation of this file.
1 // gzindex is built on top of zran, as provided by Mark Adler.
2 // The full license follows.
3 /* zran.c -- example of zlib/gzip stream indexing and random access
4  * Copyright (C) 2005, 2012 Mark Adler
5  * For conditions of distribution and use, see copyright notice in zlib.h
6  Version 1.1 29 Sep 2012 Mark Adler */
7 
8 /* Version History:
9  1.0 29 May 2005 First version
10  1.1 29 Sep 2012 Fix memory reallocation error
11  */
12 
13 /* Illustrate the use of Z_BLOCK, inflatePrime(), and inflateSetDictionary()
14  for random access of a compressed file. A file containing a zlib or gzip
15  stream is provided on the command line. The compressed stream is decoded in
16  its entirety, and an index built with access points about every SPAN bytes
17  in the uncompressed output. The compressed file is left open, and can then
18  be read randomly, having to decompress on the average SPAN/2 uncompressed
19  bytes before getting to the desired block of data.
20 
21  An access point can be created at the start of any deflate block, by saving
22  the starting file offset and bit of that block, and the 32K bytes of
23  uncompressed data that precede that block. Also the uncompressed offset of
24  that block is saved to provide a reference for locating a desired starting
25  point in the uncompressed stream. build_index() works by decompressing the
26  input zlib or gzip stream a block at a time, and at the end of each block
27  deciding if enough uncompressed data has gone by to justify the creation of
28  a new access point. If so, that point is saved in a data structure that
29  grows as needed to accommodate the points.
30 
31  To use the index, an offset in the uncompressed data is provided, for which
32  the latest access point at or preceding that offset is located in the index.
33  The input file is positioned to the specified location in the index, and if
34  necessary the first few bits of the compressed data are read from the file.
35  inflate is initialized with those bits and the 32K of uncompressed data, and
36  the decompression then proceeds until the desired offset in the file is
37  reached. Then the decompression continues to read the desired uncompressed
38  data from the file.
39 
40  Another approach would be to generate the index on demand. In that case,
41  requests for random access reads from the compressed data would try to use
42  the index, but if a read far enough past the end of the index is required,
43  then further index entries would be generated and added.
44 
45  There is some fair bit of overhead to starting inflation for the random
46  access, mainly copying the 32K byte dictionary. So if small pieces of the
47  file are being accessed, it would make sense to implement a cache to hold
48  some lookahead and avoid many calls to extract() for small lengths.
49 
50  Another way to build an index would be to use inflateCopy(). That would
51  not be constrained to have access points at block boundaries, but requires
52  more memory per access point, and also cannot be saved to file due to the
53  use of pointers in the state. The approach here allows for storage of the
54  index in a file.
55  */
56 
57 #ifndef __LREZ_GZ_INDEX__
58 #define __LREZ_GZ_INDEX__
59 
60 using namespace std;
61 
62 #include <stdio.h>
63 #include <stdlib.h>
64 #include <string.h>
65 #include <zlib.h>
66 #include <iostream>
67 #include <fstream>
68 
69 #define local static
70 
71 #define SPAN 1048576L /* desired distance between access points */
72 #define WINSIZE 32768U /* sliding window size */
73 #define CHUNK 16384 /* file input buffer size */
74 
75 /* access point entry: what is saved at the start of a deflate block so that decompression can be restarted there */
76 struct point {
77  off_t out; /* offset in the uncompressed data that this access point corresponds to */
78  off_t in; /* offset in input file of first full byte */
79  int bits; /* number of bits (1-7) from byte at in - 1, or 0 */
80  unsigned char window[WINSIZE]; /* the WINSIZE (32K) bytes of uncompressed data preceding this point */
81 };
82 
83 /* access point list: the index itself, a collection of struct point entries plus bookkeeping */
84 struct access {
85  int have; /* number of list entries filled in */
86  int size; /* number of list entries allocated */
87  long maxOffset; /* last offset of the uncompressed file; NOTE(review): declared long while point::out and point::in are off_t -- confirm the widths agree on every target */
88  struct point *list; /* allocated list of access points (size entries allocated, have of them filled in) */
89 };
90 
95 void freeGzIndex(struct access *index);
96 
104 void serializeGzIndex(struct access* index, string outputFile);
105 
114 struct access* deserializeGzIndex(struct access* index, string inputFile);
115 
127 int buildGzIndex(string gzFile, off_t span, struct access **built);
128 
139 int buildGzIndex_Stream(FILE *in, off_t span, struct access **built);
140 
152 int extract(string gzFile, struct access *index, off_t offset, unsigned char *buf, int len);
153 
164 int extract_Stream(FILE *in, struct access *index, off_t offset, unsigned char *buf, int len);
165 
174 string extractFastqReadFromOffset(FILE* in, struct access* index, off_t offset);
175 
176 #endif
buildGzIndex
int buildGzIndex(string gzFile, off_t span, struct access **built)
access::have
int have
Definition: gzIndex.h:85
access::size
int size
Definition: gzIndex.h:86
WINSIZE
#define WINSIZE
Definition: gzIndex.h:72
freeGzIndex
void freeGzIndex(struct access *index)
access::list
struct point * list
Definition: gzIndex.h:88
extractFastqReadFromOffset
string extractFastqReadFromOffset(FILE *in, struct access *index, off_t offset)
point::in
off_t in
Definition: gzIndex.h:78
access::maxOffset
long maxOffset
Definition: gzIndex.h:87
point
Definition: gzIndex.h:76
serializeGzIndex
void serializeGzIndex(struct access *index, string outputFile)
buildGzIndex_Stream
int buildGzIndex_Stream(FILE *in, off_t span, struct access **built)
point::out
off_t out
Definition: gzIndex.h:77
point::bits
int bits
Definition: gzIndex.h:79
access
Definition: gzIndex.h:84
extract
int extract(string gzFile, struct access *index, off_t offset, unsigned char *buf, int len)
deserializeGzIndex
struct access * deserializeGzIndex(struct access *index, string inputFile)
extract_Stream
int extract_Stream(FILE *in, struct access *index, off_t offset, unsigned char *buf, int len)