Skip to content

Commit

Permalink
refactor: move finding files into separate class
Browse files Browse the repository at this point in the history
  • Loading branch information
RuedigerVoigt committed Jul 21, 2021
1 parent 7bc9de1 commit 0326aef
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 60 deletions.
6 changes: 4 additions & 2 deletions salted/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from salted import database_io
from salted import doi_check
from salted import err
from salted import file_finder
from salted import input_handler
from salted import memory_instance
from salted import url_check
Expand Down Expand Up @@ -154,11 +155,12 @@ def check(self,
logging.exception(msg)
raise FileNotFoundError(msg)

filesearch = file_finder.FileFinder()
file_io = input_handler.InputHandler(db)
files_to_check = list()
if path.is_dir():
logging.info('Base folder: %s', path)
files_to_check = file_io.find_files_by_extensions(path)
files_to_check = filesearch.find_files_by_extensions(path)
if files_to_check:
file_io.scan_files(files_to_check)
mem_instance.generate_indices()
Expand All @@ -168,7 +170,7 @@ def check(self,
logging.warning(
"No supported files in this folder or its subfolders.")
return
elif path.is_file() and file_io.is_supported_format(path):
elif path.is_file() and filesearch.is_supported_format(path):
files_to_check.append(path)
else:
msg = f"File format of {path} not supported"
Expand Down
72 changes: 72 additions & 0 deletions salted/file_finder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

"""
Find Files for salted
~~~~~~~~~~~~~~~~~~~~~
Source: https://github.com/RuedigerVoigt/salted
(c) 2020-2021: Rüdiger Voigt
Released under the Apache License 2.0
"""

import logging
import pathlib
from typing import Final, List, Optional


class FileFinder:
    """Methods to find files in supported formats.

    Files are selected purely by their filename suffix; the file content
    is never inspected. Suffix comparison is exact (case-sensitive).
    """

    # Suffixes (including the leading dot) used as the default filter.
    SUPPORTED_SUFFIX: Final[set] = {".htm", ".html", '.md', '.tex', '.bib'}

    def __init__(self) -> None:
        # The class holds no instance state.
        return

    def is_supported_format(self,
                            filepath: pathlib.Path) -> bool:
        """Check - using the filename suffix - if the file format is supported.

        Returns True if the suffix of `filepath` is in SUPPORTED_SUFFIX.
        """
        return filepath.suffix in self.SUPPORTED_SUFFIX

    def find_files_by_extensions(
            self,
            path_to_base_folder: pathlib.Path,
            suffixes: Optional[set] = None) -> List[pathlib.Path]:
        """Find all files with specific file type suffixes in the base folder
        and its subfolders.

        If no suffix is specified (None or an empty set), this looks for all
        file formats listed in SUPPORTED_SUFFIX.

        Returns a list of resolved paths. Only regular files are returned:
        a directory whose name happens to end with a matching suffix
        (for example a folder called 'notes.md') is not reported.
        """
        # self is undefined at the time the signature is evaluated, so the
        # class constant cannot be the default value. Fall back here instead:
        if not suffixes:
            suffixes = self.SUPPORTED_SUFFIX

        # Accept plain strings as well as Path objects.
        base_folder = pathlib.Path(path_to_base_folder)
        # '**/*' yields directories too, hence the explicit is_file() check.
        files_to_check = [
            candidate.resolve()
            for candidate in base_folder.glob('**/*')
            if candidate.suffix in suffixes and candidate.is_file()]
        logging.debug('Found %s files', len(files_to_check))
        return files_to_check

    def find_html_files(self,
                        path_to_base_folder: pathlib.Path
                        ) -> List[pathlib.Path]:
        "Find all HTML files (.htm, .html) in the base folder and its subfolders."
        return self.find_files_by_extensions(
            path_to_base_folder,
            {".htm", ".html"})

    def find_markdown_files(self,
                            path_to_base_folder: pathlib.Path
                            ) -> List[pathlib.Path]:
        "Find all markdown files (.md) in the base folder and its subfolders."
        return self.find_files_by_extensions(
            path_to_base_folder,
            {".md"})

    def find_tex_files(self,
                       path_to_base_folder: pathlib.Path
                       ) -> List[pathlib.Path]:
        "Find all .tex files in the base folder and its subfolders."
        return self.find_files_by_extensions(
            path_to_base_folder,
            {".tex"})
55 changes: 2 additions & 53 deletions salted/input_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from collections import Counter
import logging
import pathlib
from typing import Final, List, Optional
from typing import List, Optional
import urllib.parse

import userprovided
Expand All @@ -23,65 +23,14 @@


class InputHandler:
"""Methods to find files and the hyperlinks inside them."""

SUPPORTED_SUFFIX: Final[set] = {".htm", ".html", '.md', '.tex', '.bib'}
"""read files and extract the hyperlinks inside them."""

def __init__(self,
db: database_io.DatabaseIO):
self.db = db
self.cnt: Counter = Counter()
self.parser = parser.Parser()

def is_supported_format(self,
filepath: pathlib.Path) -> bool:
"Checks - using the filename suffix - if the file format is supported."
return bool(filepath.suffix in self.SUPPORTED_SUFFIX)

def find_files_by_extensions(
self,
path_to_base_folder: pathlib.Path,
suffixes: Optional[set] = None) -> List[pathlib.Path]:
"""Find all files with specific file type suffixes in the base folder
and its subfolders. If no file suffix is specified, this will look
for all file formats supported by salted."""
# self undefined at time of definition. Therefore fallback here:
if not suffixes:
suffixes = self.SUPPORTED_SUFFIX

files_to_check = []
path_to_check = pathlib.Path(path_to_base_folder)
all_files = path_to_check.glob('**/*')
for candidate in all_files:
if candidate.suffix in suffixes:
files_to_check.append(candidate.resolve())
logging.debug('Found %s files', len(files_to_check))
return files_to_check

def find_html_files(self,
path_to_base_folder: pathlib.Path
) -> List[pathlib.Path]:
"Find all HTML files in the base folder and its subfolders."
return self.find_files_by_extensions(
path_to_base_folder,
{".htm", ".html"})

def find_markdown_files(self,
path_to_base_folder: pathlib.Path
) -> List[pathlib.Path]:
"Find all markdown files in the base folder and its subfolders."
return self.find_files_by_extensions(
path_to_base_folder,
{".md"})

def find_tex_files(self,
path_to_base_folder: pathlib.Path
) -> List[pathlib.Path]:
"Find all .tex files in the base folder and its subfolders."
return self.find_files_by_extensions(
path_to_base_folder,
{".tex"})

def read_file_content(self,
path_to_file: pathlib.Path) -> Optional[str]:
"Return the file content or log an error if file cannot be accessed."
Expand Down
10 changes: 5 additions & 5 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,14 +214,14 @@ def test_file_discovery(fs):
fs.create_file('/fake/fake/noextension')
fs.create_file('/fake/fake/foo.htmlandmore')
fs.create_file('/fake/fake/fake/foo.bib')
test_io = salted.input_handler.InputHandler(None)
supported_files = test_io.find_files_by_extensions('/fake')
filesearch = salted.file_finder.FileFinder()
supported_files = filesearch.find_files_by_extensions('/fake')
assert len(supported_files) == 7
html_files = test_io.find_html_files('/fake')
html_files = filesearch.find_html_files('/fake')
assert len(html_files) == 2
md_files = test_io.find_markdown_files('/fake')
md_files = filesearch.find_markdown_files('/fake')
assert len(md_files) == 2
tex_files = test_io.find_tex_files('/fake')
tex_files = filesearch.find_tex_files('/fake')
assert len(tex_files) == 2


Expand Down

0 comments on commit 0326aef

Please sign in to comment.