From 584bda2614e20213ca55b58981f75f5164d6ecfa Mon Sep 17 00:00:00 2001
From: galkahana <gal.kahana@hotmail.com>
Date: Sat, 18 Feb 2023 21:25:36 +0200
Subject: [PATCH] fix: harsher hex string parsing to avoid more dire parsing
 errors

running fuzztests on my mac was failing erraticaly for one of the files.

also recreating a decompressed version didnt work.

traced the first problem to a too permissive hex string parsing which caused overreads.

and the second problem to a too restrictive zip decoding issue where end of stream error canceled the whole stream reading. which is a shame cause it could parse something worthwhile till then, like in this case.
---
 CMakeLists.txt                       |  2 +-
 PDFWriter/InputFlateDecodeStream.cpp | 10 ++++------
 PDFWriter/PDFParserTokenizer.cpp     | 17 +++++++++++++++--
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b155c21..73044646 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.15)
 # Soname
 # MAJOR is incremented when symbols are removed or changed in an incompatible way
 # MINOR is incremented when new symbols are added
-project(PDFHummus VERSION 4.5.1)
+project(PDFHummus VERSION 4.5.2)
 
 
 option(USE_BUNDLED  "Whether to use bundled libraries" TRUE)
diff --git a/PDFWriter/InputFlateDecodeStream.cpp b/PDFWriter/InputFlateDecodeStream.cpp
index 7759011b..827561f3 100644
--- a/PDFWriter/InputFlateDecodeStream.cpp
+++ b/PDFWriter/InputFlateDecodeStream.cpp
@@ -106,8 +106,8 @@ IOBasicTypes::LongBufferSizeType InputFlateDecodeStream::Read(IOBasicTypes::Byte
 
 IOBasicTypes::LongBufferSizeType InputFlateDecodeStream::DecodeBufferAndRead(const IOBasicTypes::Byte* inBuffer,IOBasicTypes::LongBufferSizeType inSize)
 {
-	if(0 == inSize)
-		return 0; // inflate kinda touchy about getting 0 lengths
+	if(0 == inSize || mEndOfCompressionEoncountered)
+		return 0; // inflate kinda touchy about getting 0 lengths, also stop if already decided to finish
 
 	int inflateResult = Z_OK;
 
@@ -168,10 +168,8 @@ IOBasicTypes::LongBufferSizeType InputFlateDecodeStream::DecodeBufferAndRead(con
 
 	// should be that at the last buffer we'll get here a nice Z_STREAM_END
 	mEndOfCompressionEoncountered = (Z_STREAM_END == inflateResult) || isError(inflateResult);
-	if(Z_OK == inflateResult || Z_STREAM_END == inflateResult)
-		return inSize - mZLibState->avail_out;
-	else
-		return 0;
+	// allows returning what's read even if finished with error. a forgiving approach for end of stream errors
+	return inSize - mZLibState->avail_out;
 }
 
 bool InputFlateDecodeStream::NotEnded()
diff --git a/PDFWriter/PDFParserTokenizer.cpp b/PDFWriter/PDFParserTokenizer.cpp
index 0bb75a11..44f516dc 100644
--- a/PDFWriter/PDFParserTokenizer.cpp
+++ b/PDFWriter/PDFParserTokenizer.cpp
@@ -21,6 +21,7 @@
 #include "PDFParserTokenizer.h"
 #include "IByteReader.h"
 #include "OutputStringBufferStream.h"
+#include "Trace.h"
 
 using namespace PDFHummus;
 using namespace IOBasicTypes;
@@ -212,8 +213,20 @@ BoolAndString PDFParserTokenizer::GetNextToken()
 							break;
 						}
 
-						if(!IsPDFWhiteSpace(buffer))
-							tokenBuffer.Write(&buffer,1);
+						if(IsPDFWhiteSpace(buffer))
+							continue;
+
+						// verify that getting a hex char or ending char, so that we are sure there's no orcish mischief
+						if(!(buffer>='0' && buffer<='9') &&
+						   !(buffer>='A' && buffer<='F') &&
+						   !(buffer>='a' && buffer<='f') &&
+						   buffer != '>') {
+							TRACE_LOG1("PDFParserTokenizer::GetNextToken, encountered non ascii char in hex string. probably a corruption. byte value = %d", buffer);
+							result.first = false;
+							break;
+						}
+
+						tokenBuffer.Write(&buffer,1);
 					}
 				}
 				result.second = tokenBuffer.ToString();