Skip to content

Commit

Permalink
Feature 779 compare pdf (#860)
Browse files Browse the repository at this point in the history
  • Loading branch information
georgemccabe authored Mar 30, 2021
1 parent 3eb23e0 commit 6809d8a
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 26 deletions.
2 changes: 1 addition & 1 deletion ci/actions/run_tests/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ fi

# install Pillow library needed for diff testing
# this will be replaced with better image diffing package used by METplotpy
pip_command="pip3 install Pillow"
pip_command="pip3 install Pillow; yum -y install poppler-utils; pip3 install pdf2image"

# build command to run
command="./ci/jobs/run_use_cases.py ${CATEGORIES} ${SUBSETLIST}"
Expand Down
17 changes: 13 additions & 4 deletions ci/jobs/run_use_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,16 @@ def copy_diff_output(diff_files):
and file path of output that was just generated. Either tuple
value may be an empty string if the file was not found.
"""
for truth_file, out_file, _ in diff_files:
for truth_file, out_file, _, diff_file in diff_files:
if truth_file:
copy_to_diff_dir(truth_file,
'truth')
if out_file:
copy_to_diff_dir(out_file,
'output')
if diff_file:
copy_to_diff_dir(diff_file,
'diff')

def copy_to_diff_dir(file_path, data_type):
"""! Generate output path based on input file path,
Expand All @@ -97,8 +100,12 @@ def copy_to_diff_dir(file_path, data_type):
diff_out = file_path.replace(data_dir, DIFF_DIR)

# add data type identifier to filename before extension
output_path, extension = os.path.splitext(diff_out)
output_path = f'{output_path}_{data_type}{extension}'
# if data is not difference output
if data_type == 'diff':
output_path = diff_out
else:
output_path, extension = os.path.splitext(diff_out)
output_path = f'{output_path}_{data_type}{extension}'

# create output directory if it doesn't exist
output_dir = os.path.dirname(output_path)
Expand Down Expand Up @@ -143,7 +150,9 @@ def main():
if compare and isOK:
print('******************************')
print("Comparing output to truth data")
diff_files = compare_dir(TRUTH_DIR, OUTPUT_DIR, debug=True)
diff_files = compare_dir(TRUTH_DIR, OUTPUT_DIR,
debug=True,
save_diff=True)
if diff_files:
isOK = False

Expand Down
119 changes: 98 additions & 21 deletions ci/util/diff_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@
'.zip',
]

UNSUPPORTED_EXTENSIONS = [
PDF_EXTENSIONS = [
'.pdf',
]

UNSUPPORTED_EXTENSIONS = [
]

def get_file_type(filepath):
_, file_extension = os.path.splitext(filepath)
if file_extension in IMAGE_EXTENSIONS:
Expand All @@ -43,15 +46,18 @@ def get_file_type(filepath):
if file_extension in SKIP_EXTENSIONS:
return 'skip'

if file_extension in PDF_EXTENSIONS:
return 'pdf'

if file_extension in UNSUPPORTED_EXTENSIONS:
return f'unsupported{file_extension}'

return 'unknown'

def compare_dir(dir_a, dir_b, debug=False):
# if input are files and not directories, compare them
def compare_dir(dir_a, dir_b, debug=False, save_diff=False):
# if input are files and not directories, compare them
if os.path.isfile(dir_a):
result = compare_files(dir_a, dir_b, debug=debug)
result = compare_files(dir_a, dir_b, debug=debug, save_diff=save_diff)
if result is None or result is True:
return []

Expand Down Expand Up @@ -83,7 +89,8 @@ def compare_dir(dir_a, dir_b, debug=False):
filepath_b,
debug=debug,
dir_a=dir_a,
dir_b=dir_b)
dir_b=dir_b,
save_diff=save_diff)

# no differences or skipped
if result is None or result is True:
Expand All @@ -101,22 +108,29 @@ def compare_dir(dir_a, dir_b, debug=False):
filepath_b = os.path.join(root, filename)
filepath_a = filepath_b.replace(dir_b, dir_a)
if not os.path.exists(filepath_a):
# check if missing file is actually diff file that was generated
diff_list = [item[3] for item in diff_files]
if filepath_b in diff_list:
continue
print(f"ERROR: File does not exist: {filepath_a}")
diff_files.append(('', filepath_b, 'file not found (new output)'))
diff_files.append(('', filepath_b, 'file not found (new output)', ''))

print("\nSummary:\n")
if diff_files:
print("\nERROR: Some differences were found")
for filepath_a, filepath_b, reason in diff_files:
for filepath_a, filepath_b, reason, diff_file in diff_files:
print(f"{reason}\n A:{filepath_a}\n B:{filepath_b}")
if diff_file:
print(f"Difference file: {diff_file}")
else:
print("\nNo differences found in any files")

print("Finished comparing directories\n"
"**************************************************\n\n")
return diff_files

def compare_files(filepath_a, filepath_b, debug=False, dir_a=None, dir_b=None):
def compare_files(filepath_a, filepath_b, debug=False, dir_a=None, dir_b=None,
save_diff=False):
# dir_a and dir_b are only needed if comparing file lists that need those
# directories to substitute when comparing because files in the list will
# have different paths
Expand All @@ -127,7 +141,7 @@ def compare_files(filepath_a, filepath_b, debug=False, dir_a=None, dir_b=None):
if not os.path.exists(filepath_b):
if debug:
print(f"ERROR: File does not exist: {filepath_b}")
return (filepath_a, '', 'file not found')
return (filepath_a, '', 'file not found', '')

file_type = get_file_type(filepath_a)
if file_type == 'skip':
Expand All @@ -136,31 +150,49 @@ def compare_files(filepath_a, filepath_b, debug=False, dir_a=None, dir_b=None):

if file_type.startswith('unsupported'):
print(f"Unsupported file type encountered: {file_type.split('.')[1]}")
return (filepath_a, filepath_b, file_type)
return (filepath_a, filepath_b, file_type, '')

if file_type == 'netcdf':
print("Comparing NetCDF")
if not nc_is_equal(filepath_a, filepath_b):
return (filepath_a, filepath_b, 'NetCDF diff')
return (filepath_a, filepath_b, 'NetCDF diff', '')

print("No differences in NetCDF files")
return True

if file_type == 'pdf':
print("Comparing PDF as images")
diff_file = compare_pdf_as_images(filepath_a, filepath_b,
save_diff=save_diff)
if diff_file is True:
print("No differences in PDF files")
return True

if diff_file is False:
diff_file = ''

return (filepath_a, filepath_b, 'PDF diff', diff_file)

if file_type == 'image':
print("Comparing images")
if not compare_image_files(filepath_a, filepath_b):
return (filepath_a, filepath_b, 'Image diff')
diff_file = compare_image_files(filepath_a, filepath_b,
save_diff=save_diff)
if diff_file is True:
print("No differences in image files")
return True

print("No differences in image files")
return True
if diff_file is False:
diff_file = ''

return (filepath_a, filepath_b, 'Image diff', diff_file)

# if not any of the above types, use diff to compare
print("Comparing text files")
if not filecmp.cmp(filepath_a, filepath_b):
# if files differ, open files and handle expected diffs
if not compare_txt_files(filepath_a, filepath_b, dir_a, dir_b):
print(f"ERROR: File differs: {filepath_b}")
return (filepath_a, filepath_b, 'Text diff')
return (filepath_a, filepath_b, 'Text diff', '')

print("No differences in text files")
return True
Expand All @@ -169,22 +201,67 @@ def compare_files(filepath_a, filepath_b, debug=False, dir_a=None, dir_b=None):

return True

def compare_image_files(filepath_a, filepath_b):
diff_count = 0
def compare_pdf_as_images(filepath_a, filepath_b, save_diff=False):
    """! Compare two PDF files by converting their pages to images and
    comparing the images page by page.

    @param filepath_a path to the first PDF file
    @param filepath_b path to the second PDF file
    @param save_diff if True, write a difference image for the first page
     that differs and return the path to that image
    @returns True if no differences were found, False if the files differ
     (or cannot be compared) and no difference image was written, or the
     path of the difference image that was written
    """
    # pdf2image is an optional dependency, so import it only when needed
    try:
        from pdf2image import convert_from_path
    except ModuleNotFoundError:
        print("Cannot compare PDF files without pdf2image Python package")
        return False

    images_a = convert_from_path(filepath_a)
    images_b = convert_from_path(filepath_b)
    for image_a, image_b in zip(images_a, images_b):
        image_diff = compare_images(image_a, image_b)

        # no differences if None, so continue to next image from PDF
        if image_diff is None:
            continue

        # if skipping save diff files, return False b/c there are differences
        if not save_diff:
            return False

        # create difference image and return the path
        return save_diff_file(image_diff, filepath_b)

    # zip stops at the shorter list, so extra pages in either file would
    # otherwise be ignored and the files reported as identical
    if len(images_a) != len(images_b):
        print(f"ERROR: PDF page count differs: "
              f"{len(images_a)} vs {len(images_b)}")
        return False

    return True

def compare_image_files(filepath_a, filepath_b, save_diff=False):
    """! Open two image files and compare their contents.

    @param filepath_a path to the first image file
    @param filepath_b path to the second image file
    @param save_diff if True and the images differ, write a difference
     image and return the path to that image
    @returns True if no differences were found, False if the images differ
     and save_diff is not set, or the path of the difference image that
     was written
    """
    # open with context managers so both file handles are closed even if
    # the comparison raises an exception
    with Image.open(filepath_a) as image_a, Image.open(filepath_b) as image_b:
        image_diff = compare_images(image_a, image_b)

    if image_diff is None:
        return True

    if not save_diff:
        return False

    return save_diff_file(image_diff, filepath_b)

def compare_images(image_a, image_b):
    """! Compare pillow image objects. Returns difference image object if there
    are differences or None if not.

    @param image_a first pillow image object
    @param image_b second pillow image object
    @returns difference image object if any pixel differs or the image
     sizes differ, otherwise None
    """
    image_diff = ImageChops.difference(image_a, image_b)

    # NOTE(review): the difference image appears to cover only the region
    # common to both inputs, so a size mismatch is reported explicitly
    # instead of relying on the pixel scan below - confirm with Pillow docs
    if image_a.size != image_b.size:
        print(f"ERROR: Image sizes differ: {image_a.size} vs {image_b.size}")
        return image_diff

    diff_count = 0
    nx, ny = image_diff.size
    for x in range(nx):
        for y in range(ny):
            pixel = image_diff.getpixel((x, y))
            # single-band images give a number and multi-band images give
            # a tuple with one value per band, so check every band instead
            # of only matching 3- and 4-band zero tuples
            bands = pixel if isinstance(pixel, tuple) else (pixel,)
            if any(bands):
                print(f"Difference pixel: {pixel}")
                diff_count += 1

    if diff_count:
        print(f"ERROR: Found {diff_count} differences between images")
        return image_diff

    return None

def save_diff_file(image_diff, filepath_b):
    """! Write a difference image as a PNG file next to the compared file.
    The output name is filepath_b with its extension replaced by
    _diff.png.

    @param image_diff image object to write (must provide a save method)
    @param filepath_b path of the compared file used to build the name
     of the difference file
    @returns path of the difference file that was written
    """
    # only the path without its extension is needed; output is always PNG
    rel_path = os.path.splitext(filepath_b)[0]
    diff_file = f'{rel_path}_diff.png'
    print(f"Saving diff file: {diff_file}")
    image_diff.save(diff_file, "PNG")
    return diff_file

def compare_txt_files(filepath_a, filepath_b, dir_a=None, dir_b=None):
with open(filepath_a, 'r') as file_handle:
Expand Down

0 comments on commit 6809d8a

Please sign in to comment.