Skip to content

Commit

Permalink
fix(label_split_data): add shuffle and clean extraction (#30)
Browse files Browse the repository at this point in the history
  • Loading branch information
guillaume-chervet authored Jan 13, 2024
1 parent 2d88557 commit 99ba957
Show file tree
Hide file tree
Showing 13 changed files with 324 additions and 123 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -356,4 +356,4 @@ jobs:
./bin/generate-changelog.sh "$PROJECT_URL"
git add ./CHANGELOG.md
git commit -m "[skip ci] Generate changelog to version ${{ steps.tag.outputs.new_version }}"
git push --set-upstream origin "HEAD:main" --follow-tags -f
git push -f --set-upstream origin "HEAD:main" --follow-tags
68 changes: 51 additions & 17 deletions packages/extraction/mlopspython_extraction/extraction.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import io
from abc import abstractmethod, ABC
from dataclasses import dataclass
from io import BytesIO
from pathlib import Path
from typing import Iterable
from typing import Iterable, List

import fitz
from fitz import Pixmap


def convert_pixmap_to_rgb(pixmap) -> Pixmap:
"""Convert to rgb in order to write on png"""
"""Convert to rgb in order to write on png"""
# check if it is already on rgb
if pixmap.n < 4:
return pixmap
Expand Down Expand Up @@ -41,18 +43,50 @@ class ExtractImagesResult:
number_images_output: int


def extract_images(pdfs_directory_path: str, images_directory_path: str) -> ExtractImagesResult:
    """Dump the images embedded in a directory's PDF files to disk.

    Only regular files whose suffix is exactly ``.pdf`` directly inside
    ``pdfs_directory_path`` are processed. The output directory is created
    (with its parents) when missing. Each image yielded by
    ``extract_images_stream`` is written to a file named
    ``<pdf stem>_page<page index>_index<image index>.png``.

    Returns:
        An ``ExtractImagesResult`` carrying the number of PDF files read and
        the number of image files written.
    """
    pdf_paths = [
        entry
        for entry in Path(pdfs_directory_path).iterdir()
        if entry.is_file() and entry.suffix == ".pdf"
    ]
    target_directory = Path(images_directory_path)
    target_directory.mkdir(parents=True, exist_ok=True)

    name_template = "{0}_page{1}_index{2}.png"
    written_count = 0
    for pdf_path in pdf_paths:
        with open(pdf_path, "rb") as pdf_handle:
            pdf_content = pdf_handle.read()
        for extracted in extract_images_stream(pdf_content):
            image_name = name_template.format(
                pdf_path.stem,
                str(extracted.index_page),
                str(extracted.index_image),
            )
            written_count += 1
            with open(target_directory / image_name, "wb") as image_handle:
                image_handle.write(extracted.image_bytes_io.getbuffer())

    return ExtractImagesResult(
        number_files_input=len(pdf_paths),
        number_images_output=written_count,
    )
class IDataManager(ABC):
    """Abstract storage boundary used by the image extraction step.

    Implementations provide the file-system operations so that the
    extraction logic can be exercised with a substitute (e.g. a mock).
    """

    @abstractmethod
    def get_pdf_files(self, pdfs_directory_path: str) -> List[Path]:
        """Return the paths of the PDF files found in ``pdfs_directory_path``."""

    @abstractmethod
    def save_image(self, image_bytes_io: io.BytesIO, image_path: str) -> None:
        """Persist the content of ``image_bytes_io`` at ``image_path``.

        The signature mirrors how the method is actually invoked: the caller
        hands over the in-memory image buffer and the full destination file
        path, not an image result object and a directory.
        """

    @abstractmethod
    def create_directory(self, directory_path: str) -> None:
        """Make sure ``directory_path`` exists."""


class DataManager(IDataManager):
    """File-system backed implementation of ``IDataManager``."""

    def get_pdf_files(self, pdfs_directory_path: str) -> List[Path]:
        """Return, in sorted order, the ``.pdf`` files directly inside the directory.

        Only regular files whose suffix is exactly ``.pdf`` are kept; sorting
        makes the processing order independent of directory listing order.
        """
        return sorted(
            entry
            for entry in Path(pdfs_directory_path).iterdir()
            if entry.is_file() and entry.suffix == ".pdf"
        )

    def save_image(self, image_bytes_io: io.BytesIO, image_path: str) -> None:
        """Write the whole content of ``image_bytes_io`` to ``image_path``."""
        with open(image_path, "wb") as destination:
            destination.write(image_bytes_io.getbuffer())

    def create_directory(self, directory_path: str) -> None:
        """Create ``directory_path`` and any missing parents; no error if it exists."""
        target = Path(directory_path)
        target.mkdir(parents=True, exist_ok=True)


class ExtractImages:
    """Extract the images embedded in PDF files through an ``IDataManager``."""

    def __init__(self, data_manager: IDataManager):
        # Listing PDFs, creating the output directory and saving images are
        # delegated to this collaborator.
        self.data_manager = data_manager

    def extract_images(self, pdfs_directory_path: str, images_directory_path: str) -> ExtractImagesResult:
        """Save every image found in the PDFs of ``pdfs_directory_path``.

        The PDF list and the output directory creation go through the data
        manager; each PDF itself is read with the built-in ``open``. Every
        image yielded by ``extract_images_stream`` is saved under
        ``images_directory_path`` as
        ``<pdf stem>_page<page index>_index<image index>.png``.

        Returns:
            An ``ExtractImagesResult`` with the number of PDF files processed
            and the number of images handed to ``save_image``.
        """
        storage = self.data_manager
        pdf_paths = storage.get_pdf_files(pdfs_directory_path)
        storage.create_directory(images_directory_path)

        name_template = "{0}_page{1}_index{2}.png"
        saved_count = 0
        for pdf_path in pdf_paths:
            with open(pdf_path, "rb") as pdf_handle:
                pdf_content = pdf_handle.read()
            for extracted in extract_images_stream(pdf_content):
                image_name = name_template.format(
                    pdf_path.stem,
                    str(extracted.index_page),
                    str(extracted.index_image),
                )
                saved_count += 1
                destination = Path(images_directory_path) / image_name
                storage.save_image(extracted.image_bytes_io, str(destination))

        return ExtractImagesResult(
            number_files_input=len(pdf_paths),
            number_images_output=saved_count,
        )
30 changes: 20 additions & 10 deletions packages/extraction/tests/extraction_tests.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,33 @@
import shutil
import unittest
from pathlib import Path
from unittest.mock import Mock

from mlopspython_extraction.extraction import extract_images
from mlopspython_extraction.extraction import ExtractImages, IDataManager, DataManager

BASE_PATH = Path(__file__).resolve().parent
output_directory = BASE_PATH / "output"
input_directory = BASE_PATH / "input"

class TestExtraction(unittest.TestCase):

class TestExtraction(unittest.TestCase):
def test_pdfs_images_should_be_extracted(self):
if output_directory.is_dir():
shutil.rmtree(str(output_directory))
result = extract_images(str(input_directory), str(output_directory))
expected_number_files_input = 2
self.assertEqual(result.number_files_input, expected_number_files_input)
expected_number_images_output = 3
self.assertEqual(result.number_images_output, expected_number_images_output)
data_manager_mock = Mock(IDataManager)
data_manager_mock.get_pdf_files = DataManager().get_pdf_files
data_manager_mock.save_image = Mock()
data_manager_mock.create_directory = Mock()

extract_images = ExtractImages(data_manager_mock)

input_directory_str = str(input_directory)
output_directory_str = str(output_directory)
extract_images.extract_images(input_directory_str, output_directory_str)

data_manager_mock.create_directory.assert_called_once_with(output_directory_str)

save_image_call = data_manager_mock.save_image.call_args_list
self.assertTrue(save_image_call[0].args[1].endswith("a_page0_index0.png"))
self.assertTrue(save_image_call[1].args[1].endswith("a_page1_index0.png"))
self.assertTrue(save_image_call[2].args[1].endswith("b_page0_index0.png"))


if __name__ == "__main__":
Expand Down
File renamed without changes.
File renamed without changes.
7 changes: 4 additions & 3 deletions train/extraction/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from pathlib import Path

from directory_hash import hash_dir
from mlopspython_extraction.extraction import extract_images
from mlopspython_extraction.extraction import ExtractImages, DataManager
import mlflow

parser = argparse.ArgumentParser("extraction")
Expand All @@ -15,7 +15,8 @@
images_output = args.images_output
hash_output = args.hash_output

result = extract_images(pdfs_input, images_output)
extract_images = ExtractImages(DataManager())
result = extract_images.extract_images(pdfs_input, images_output)

computed_hash = hash_dir(images_output)
with open(str(Path(hash_output) / "hash.txt"), "w") as file:
Expand All @@ -26,6 +27,6 @@
number_files_input: {result.number_files_input}
number_images_output: {result.number_images_output}
computed_hash: {computed_hash}"""

mlflow.log_metric("number_files_input", result.number_files_input)
mlflow.log_metric("number_images_output", result.number_images_output)
10 changes: 8 additions & 2 deletions train/label_split_data/command.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import argparse
from pathlib import Path
import mlflow
from label_split_data import label_split_data
from label_split_data import DataSplit, LabelSplitDataInput

parser = argparse.ArgumentParser("label_split_data")
parser.add_argument("--labels_input", type=str)
Expand All @@ -27,7 +27,8 @@

mlflow.log_params(params)
labels_files_path = Path(labels_input) / "cats_dogs_others_classification-annotations.json"
label_split_data_result = label_split_data(

label_split_data_input = LabelSplitDataInput(
labels_files_path,
Path(images_input),
Path(pdfs_input),
Expand All @@ -38,6 +39,11 @@
ratio_train,
ratio_test,
)

data_split = DataSplit()
label_split_data_result = data_split.label_split_data(
label_split_data_input
)
mlflow.log_metric("number_file_train_by_label", label_split_data_result.number_file_train_by_label)
mlflow.log_metric("number_file_test_by_label", label_split_data_result.number_file_test_by_label)
mlflow.log_metric("number_file_evaluate_by_label", label_split_data_result.number_file_evaluate_by_label)
Loading

0 comments on commit 99ba957

Please sign in to comment.