Skip to content

Commit

Permalink
Efficient multi-image tiff reading.
Browse files Browse the repository at this point in the history
* This resolves the longstanding need for linear-time performance when
  reading multi-image TIFF files.  For example, tesseract should now be
  able to store a million small images in one file and extract them efficiently.
  See, e.g., tesseract-ocr/tesseract#233
  Thanks to Jeff Breidenbach for figuring out how to do this in a
  general way without exposing TIFF internals to the client.
  • Loading branch information
DanBloomberg committed Sep 10, 2016
1 parent becbf9c commit c65f0d2
Show file tree
Hide file tree
Showing 4 changed files with 263 additions and 27 deletions.
70 changes: 68 additions & 2 deletions prog/mtifftest.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,12 @@ static const char *weasel_orig = "/tmp/lept/tiff/weasel_orig";
int main(int argc,
char **argv)
{
l_uint8 *data;
char *fname, *filename;
const char *str;
char buffer[512];
l_int32 i, npages;
size_t length;
l_int32 i, n, npages;
size_t length, offset, size;
FILE *fp;
NUMA *naflags, *nasizes;
PIX *pix, *pix1, *pix2, *pixd;
Expand Down Expand Up @@ -78,6 +79,71 @@ static char mainName[] = "mtifftest";
pixDisplay(pixd, 100, 400);
pixDestroy(&pixd);
pixaDestroy(&pixa);

/* This uses the offset method for linearizing overhead of
* reading from a multi-image tiff file. */
offset = 0;
n = 0;
pixa = pixaCreate(8);
do {
pix1 = pixReadFromMultipageTiff("/tmp/lept/tiff/weasel8.tif", &offset);
if (!pix1) continue;
pixaAddPix(pixa, pix1, L_INSERT);
fprintf(stderr, "offset = %ld\n", offset);
n++;
} while (offset != 0);
fprintf(stderr, "Num images = %d\n", n);
pixd = pixaDisplayTiledInRows(pixa, 32, 1200, 1.2, 0, 15, 4);
pixDisplay(pixd, 100, 550);
pixDestroy(&pixd);
pixaDestroy(&pixa);

/* This uses the offset method for linearizing overhead of
* reading from a multi-image tiff file in memory. */
offset = 0;
n = 0;
pixa = pixaCreate(8);
data = l_binaryRead("/tmp/lept/tiff/weasel8.tif", &size);
do {
pix1 = pixReadMemFromMultipageTiff(data, size, &offset);
if (!pix1) continue;
pixaAddPix(pixa, pix1, L_INSERT);
fprintf(stderr, "offset = %ld\n", offset);
n++;
} while (offset != 0);
fprintf(stderr, "Num images = %d\n", n);
pixd = pixaDisplayTiledInRows(pixa, 32, 1200, 1.2, 0, 15, 4);
pixDisplay(pixd, 100, 700);
pixDestroy(&pixd);
pixaDestroy(&pixa);
lept_free(data);

/* This makes a 1001 image tiff file and gives timing
* for writing and reading. Reading uses the offset method
* and the time is linear in the number of images, but the
* writing time is quadratic and the actual wall clock time is
* significantly more than the printed value. */
pix1 = pixRead("char.tif");
startTimer();
pixWriteTiff("/tmp/lept/tiff/junkm.tif", pix1, IFF_TIFF_G4, "w");
for (i = 0; i < 1000; i++) {
pixWriteTiff("/tmp/lept/tiff/junkm.tif", pix1, IFF_TIFF_G4, "a");
}
pixDestroy(&pix1);
fprintf(stderr, "Time to write: %7.3f\n", stopTimer());
startTimer();
offset = 0;
n = 0;
do {
pix1 = pixReadFromMultipageTiff("/tmp/lept/tiff/junkm.tif", &offset);
if (!pix1) continue;
if (n % 100 == 0)
fprintf(stderr, "offset = %ld\n", offset);
pixDestroy(&pix1);
n++;
} while (offset != 0);
fprintf(stderr, "Time to read: %7.3f\n", stopTimer());
fprintf(stderr, "Num images = %d\n", n);
#endif

#if 1 /* ------------ Test single-to-multipage I/O -------------------*/
Expand Down
2 changes: 2 additions & 0 deletions src/allheaders.h
Original file line number Diff line number Diff line change
Expand Up @@ -2549,6 +2549,7 @@ LEPT_DLL extern PIX * pixReadStreamTiff ( FILE *fp, l_int32 n );
LEPT_DLL extern l_int32 pixWriteTiff ( const char *filename, PIX *pix, l_int32 comptype, const char *modestring );
LEPT_DLL extern l_int32 pixWriteTiffCustom ( const char *filename, PIX *pix, l_int32 comptype, const char *modestring, NUMA *natags, SARRAY *savals, SARRAY *satypes, NUMA *nasizes );
LEPT_DLL extern l_int32 pixWriteStreamTiff ( FILE *fp, PIX *pix, l_int32 comptype );
LEPT_DLL extern PIX * pixReadFromMultipageTiff ( const char *fname, size_t *poffset );
LEPT_DLL extern PIXA * pixaReadMultipageTiff ( const char *filename );
LEPT_DLL extern l_int32 writeMultipageTiff ( const char *dirin, const char *substr, const char *fileout );
LEPT_DLL extern l_int32 writeMultipageTiffSA ( SARRAY *sa, const char *fileout );
Expand All @@ -2561,6 +2562,7 @@ LEPT_DLL extern l_int32 readHeaderMemTiff ( const l_uint8 *cdata, size_t size, l
LEPT_DLL extern l_int32 findTiffCompression ( FILE *fp, l_int32 *pcomptype );
LEPT_DLL extern l_int32 extractG4DataFromFile ( const char *filein, l_uint8 **pdata, size_t *pnbytes, l_int32 *pw, l_int32 *ph, l_int32 *pminisblack );
LEPT_DLL extern PIX * pixReadMemTiff ( const l_uint8 *cdata, size_t size, l_int32 n );
LEPT_DLL extern PIX * pixReadMemFromMultipageTiff ( const l_uint8 *cdata, size_t size, size_t *poffset );
LEPT_DLL extern l_int32 pixWriteMemTiff ( l_uint8 **pdata, size_t *psize, PIX *pix, l_int32 comptype );
LEPT_DLL extern l_int32 pixWriteMemTiffCustom ( l_uint8 **pdata, size_t *psize, PIX *pix, l_int32 comptype, NUMA *natags, SARRAY *savals, SARRAY *satypes, NUMA *nasizes );
LEPT_DLL extern l_int32 setMsgSeverity ( l_int32 newsev );
Expand Down
186 changes: 165 additions & 21 deletions src/tiffio.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@
* static l_int32 writeCustomTiffTags()
*
* Reading and writing multipage tiff
* PIXA pixaReadMultipageTiff()
* PIX *pixReadFromMultipageTiff()
* PIXA *pixaReadMultipageTiff()
* l_int32 writeMultipageTiff() [ special top level ]
* l_int32 writeMultipageTiffSA()
*
Expand All @@ -73,9 +74,10 @@
* Wrapper for TIFFOpen:
* static TIFF *openTiff()
*
* Memory I/O: reading memory --\> pix and writing pix --\> memory
* Memory I/O: reading memory --> pix and writing pix --> memory
* [10 static helper functions]
* l_int32 pixReadMemTiff();
* PIX *pixReadMemTiff();
* PIX *pixReadMemFromMultipageTiff();
* l_int32 pixWriteMemTiff();
* l_int32 pixWriteMemTiffCustom();
*
Expand Down Expand Up @@ -380,7 +382,7 @@ PIX *pix;
*
* \param[in] fp file stream
* \param[in] n page number: 0 based
* \return pix, or NULL on error e.g., if the page number is invalid
* \return pix, or NULL on error or if there are no more images in the file
*
* <pre>
* Notes:
Expand All @@ -405,20 +407,14 @@ TIFF *tif;
if ((tif = fopenTiff(fp, "r")) == NULL)
return (PIX *)ERROR_PTR("tif not opened", procName, NULL);

pix = NULL;
for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) {
TIFFSetDirectory(tif, i);
if (i == n) {
if ((pix = pixReadFromTiffStream(tif)) == NULL) {
TIFFCleanup(tif);
return NULL;
}
break;
}
if (TIFFReadDirectory(tif) == 0)
break;
if (TIFFSetDirectory(tif, n) == 0) {
TIFFCleanup(tif);
return NULL;
}
if ((pix = pixReadFromTiffStream(tif)) == NULL) {
TIFFCleanup(tif);
return NULL;
}

TIFFCleanup(tif);
return pix;
}
Expand Down Expand Up @@ -620,6 +616,7 @@ PIXCMAP *cmap;
}



/*--------------------------------------------------------------*
* Writing to file *
*--------------------------------------------------------------*/
Expand All @@ -636,8 +633,12 @@ PIXCMAP *cmap;
*
* <pre>
* Notes:
* (1) For multi-page tiff, write the first pix with mode "w" and
* (1) For multipage tiff, write the first pix with mode "w" and
* all subsequent pix with mode "a".
* (2) For multipage tiff, there is considerable overhead in the
* machinery to append an image and add the directory entry,
* and the time required for each image increases linearly
* with the number of images in the file.
* </pre>
*/
l_int32
Expand Down Expand Up @@ -669,7 +670,7 @@ pixWriteTiff(const char *filename,
* Usage:
* 1 This writes a page image to a tiff file, with optional
* extra tags defined in tiff.h
* 2 For multi-page tiff, write the first pix with mode "w" and
* 2 For multipage tiff, write the first pix with mode "w" and
* all subsequent pix with mode "a".
* 3 For the custom tiff tags:
* a The three arrays {natags, savals, satypes} must all be
Expand Down Expand Up @@ -1107,6 +1108,80 @@ l_uint32 uval, uval2;
/*--------------------------------------------------------------*
* Reading and writing multipage tiff *
*--------------------------------------------------------------*/
/*!
 * \brief   pixReadFromMultipageTiff()
 *
 * \param[in]      fname     filename
 * \param[in,out]  poffset   byte offset of the image to read; set the
 *                           offset to 0 for the first image.  On success
 *                           it is updated to the offset of the next image.
 * \return  pix, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This allows overhead for traversal of a multipage tiff file
 *          to be linear in the number of images.  This will also work
 *          with a singlepage tiff file.
 *      (2) No TIFF internal data structures are exposed to the caller
 *          (thanks to Jeff Breidenbach).
 *      (3) offset is the byte offset of a particular image in a multipage
 *          tiff file.  To get the first image in the file, input the
 *          special offset value of 0.
 *      (4) The offset is updated to point to the next image, for a
 *          subsequent call.
 *      (5) On the last image, the offset returned is 0.  Exit the loop
 *          when the returned offset is 0.  Calling again with an offset
 *          of 0 reads the first image again.
 *      (6) On failure, NULL is returned and the offset is left unchanged.
 *          A caller that loops until the offset is 0 must therefore also
 *          stop when NULL is returned, or it will retry the same offset
 *          indefinitely.
 *      (7) The file is opened and closed on each call; no handle is
 *          retained between calls.
 *      (8) For reading a multipage tiff from a memory buffer, see
 *          pixReadMemFromMultipageTiff()
 *      (9) Example usage for reading all the images in the tif file:
 *            size_t offset = 0;
 *            do {
 *                Pix *pix = pixReadFromMultipageTiff(filename, &offset);
 *                if (!pix) break;
 *                // do something with pix
 *            } while (offset != 0);
 * </pre>
 */
PIX *
pixReadFromMultipageTiff(const char  *fname,
                         size_t      *poffset)
{
l_int32  retval;
size_t   offset;
PIX     *pix;
TIFF    *tif;

    PROCNAME("pixReadFromMultipageTiff");

    if (!fname)
        return (PIX *)ERROR_PTR("fname not defined", procName, NULL);
    if (!poffset)
        return (PIX *)ERROR_PTR("&offset not defined", procName, NULL);

    if ((tif = TIFFOpen(fname, "r")) == NULL) {
        L_ERROR("tif open failed for %s\n", procName, fname);
        return NULL;
    }

        /* Set ptrs in the TIFF to the beginning of the image.
         * The handle was obtained with TIFFOpen(), so it owns the
         * underlying file; every exit path must use TIFFClose().
         * TIFFCleanup() would free the TIFF struct but leave the
         * file descriptor open. */
    offset = *poffset;
    retval = (offset == 0) ? TIFFSetDirectory(tif, 0)
                           : TIFFSetSubDirectory(tif, offset);
    if (retval == 0) {
        TIFFClose(tif);
        return NULL;
    }

    if ((pix = pixReadFromTiffStream(tif)) == NULL) {
        TIFFClose(tif);
        return NULL;
    }

        /* Advance to the next image and return the new offset.
         * The return value of TIFFReadDirectory() is intentionally
         * not checked: after the last image there is no next directory,
         * and the offset reported below is what signals the end. */
    TIFFReadDirectory(tif);
    *poffset = TIFFCurrentDirOffset(tif);
    TIFFClose(tif);
    return pix;
}


/*!
* \brief pixaReadMultipageTiff()
*
Expand Down Expand Up @@ -2188,8 +2263,8 @@ tiffUnmapCallback(thandle_t handle,
* <pre>
* Notes:
* (1) This wraps up a number of callbacks for either:
* * reading from tiff in memory buffer --\> pix
* * writing from pix --\> tiff in memory buffer
* * reading from tiff in memory buffer --> pix
* * writing from pix --> tiff in memory buffer
* (2) After use, the memstream is automatically destroyed when
* TIFFClose() is called. TIFFCleanup() doesn't free the memstream.
* </pre>
Expand Down Expand Up @@ -2244,6 +2319,8 @@ L_MEMSTREAM *mstream;
* (3) No warning messages on failure, because of how multi-page
* TIFF reading works. You are supposed to keep trying until
* it stops working.
* (4) Tiff directory overhead is linear in the input page number.
* If reading many images, use pixReadMemFromMultipageTiff().
* </pre>
*/
PIX *
Expand Down Expand Up @@ -2284,6 +2361,73 @@ TIFF *tif;
}


/*!
 * \brief   pixReadMemFromMultipageTiff()
 *
 * \param[in]      cdata     const; tiff-encoded
 * \param[in]      size      size of cdata, in bytes
 * \param[in,out]  poffset   byte offset of the image to read; set the
 *                           offset to 0 for the first image.  It is
 *                           overwritten only when an image is returned.
 * \return  pix, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This is the in-memory counterpart of
 *          pixReadFromMultipageTiff(): one image is decoded per call,
 *          and the offset is then advanced to the following image.
 *      (2) If reading sequentially from the tiff data, this is more
 *          efficient than pixReadMemTiff(), which has an overhead
 *          proportional to the image index n.
 *      (3) A tiff memstream is opened on the buffer for the duration
 *          of the call and released with TIFFClose() on every path
 *          after it has been opened.
 *      (4) Example usage for reading all the images:
 *            size_t offset = 0;
 *            do {
 *                Pix *pix = pixReadMemFromMultipageTiff(data, size, &offset);
 *                // do something with pix
 *            } while (offset != 0);
 * </pre>
 */
PIX *
pixReadMemFromMultipageTiff(const l_uint8  *cdata,
                            size_t          size,
                            size_t         *poffset)
{
l_int32   found;
l_uint8  *buf;
PIX      *pixd;
TIFF     *tiff;

    PROCNAME("pixReadMemFromMultipageTiff");

    if (!cdata)
        return (PIX *)ERROR_PTR("cdata not defined", procName, NULL);
    if (!poffset)
        return (PIX *)ERROR_PTR("&offset not defined", procName, NULL);

        /* The memstream API takes a non-const pointer; the buffer
         * is opened in read mode. */
    buf = (l_uint8 *)cdata;
    tiff = fopenTiffMemstream("tifferror", "r", &buf, &size);
    if (tiff == NULL)
        return (PIX *)ERROR_PTR("tiff stream not opened", procName, NULL);

        /* Select the requested image: offset 0 means the first directory */
    if (*poffset == 0)
        found = TIFFSetDirectory(tiff, 0);
    else
        found = TIFFSetSubDirectory(tiff, *poffset);

        /* Decode only if the directory was located */
    pixd = NULL;
    if (found != 0)
        pixd = pixReadFromTiffStream(tiff);

        /* On success, step to the next directory and report its offset */
    if (pixd != NULL) {
        TIFFReadDirectory(tiff);
        *poffset = TIFFCurrentDirOffset(tiff);
    }

    TIFFClose(tiff);
    return pixd;
}


/*!
* \brief pixWriteMemTiff()
*
Expand Down
Loading

0 comments on commit c65f0d2

Please sign in to comment.