From 8929bca83edfaf057affb05f55f0814853248bc0 Mon Sep 17 00:00:00 2001 From: mikeqfu Date: Tue, 8 Feb 2022 21:42:46 +0000 Subject: [PATCH] Modify `convert_md_to_rst` and docstrings of some functions --- pyhelpers/text.py | 72 ++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/pyhelpers/text.py b/pyhelpers/text.py index 7d80ecd..28d995b 100644 --- a/pyhelpers/text.py +++ b/pyhelpers/text.py @@ -14,8 +14,8 @@ import numpy as np import pandas as pd -from pyhelpers._cache import _ENGLISH_WRITTEN_NUMBERS -from pyhelpers.ops import dict_to_dataframe +from ._cache import _ENGLISH_WRITTEN_NUMBERS +from .ops import dict_to_dataframe, find_executable """ == Basic processing of textual data ====================================================== """ @@ -285,17 +285,17 @@ def find_similar_str(x, lookup_list, n=1, ignore_punctuation=True, method='diffl ... 'Wessex', ... 'Western'] - >>> str_similar = find_similar_str('angle', lookup_lst) + >>> str_similar = find_similar_str(x='angle', lookup_list=lookup_lst) >>> str_similar 'Anglia' - >>> str_similar = find_similar_str('angle', lookup_lst, processor='fuzzywuzzy') + >>> str_similar = find_similar_str(x='angle', lookup_list=lookup_lst, method='fuzzywuzzy') >>> str_similar 'Anglia' - >>> str_similar = find_similar_str('x', lookup_lst) + >>> str_similar = find_similar_str(x='x', lookup_list=lookup_lst) >>> str_similar # None - >>> str_similar = find_similar_str('x', lookup_lst, processor='fuzzywuzzy') + >>> str_similar = find_similar_str(x='x', lookup_list=lookup_lst, method='fuzzywuzzy') >>> str_similar 'Wessex' """ @@ -307,7 +307,7 @@ def find_similar_str(x, lookup_list, n=1, ignore_punctuation=True, method='diffl if method in {'difflib', None}: x_ = x.lower() - lookup_dict = {y.lower(): y for y in set(lookup_list)} + lookup_dict = {y.lower(): y for y in lookup_list} if ignore_punctuation: x_ = remove_punctuation(x_) @@ -323,7 +323,7 @@ def find_similar_str(x, lookup_list, n=1, ignore_punctuation=True, method='diffl elif method == 'fuzzywuzzy': import fuzzywuzzy.fuzz - l_distances = [fuzzywuzzy.fuzz.token_set_ratio(s1=x, s2=a, **kwargs) for a in set(lookup_list)] + l_distances = [fuzzywuzzy.fuzz.token_set_ratio(s1=x, s2=a, **kwargs) for a in lookup_list] if sum(l_distances) == 0: sim_str = None @@ -408,10 +408,12 @@ def calculate_idf(raw_documents, rm_punc=False): >>> from pyhelpers.text import calculate_idf >>> import pandas - >>> raw_doc = pandas.Series(['This is an apple.', - ... 'That is a pear.', - ... 'It is human being.', - ... 'Hello world!']) + >>> raw_doc_ = [ + ... 'This is an apple.', + ... 'That is a pear.', + ... 'It is human being.', + ... 'Hello world!'] + >>> raw_doc = pandas.Series(raw_doc_) >>> docs_tf_, corpus_idf_ = calculate_idf(raw_doc, rm_punc=False) >>> docs_tf_ @@ -495,10 +497,12 @@ def calculate_tf_idf(raw_documents, rm_punc=False): >>> from pyhelpers.text import calculate_tf_idf >>> import pandas - >>> raw_doc = pandas.Series(['This is an apple.', - ... 'That is a pear.', - ... 'It is human being.', - ... 'Hello world!']) + >>> raw_doc_ = [ + ... 'This is an apple.', + ... 'That is a pear.', + ... 'It is human being.', + ... 'Hello world!'] + >>> raw_doc = pandas.Series(raw_doc_) >>> docs_tf_idf_ = calculate_tf_idf(raw_doc, rm_punc=False) >>> docs_tf_idf_ @@ -617,7 +621,7 @@ def cosine_similarity_between_texts(txt1, txt2, cosine_distance=False): return cos_similarity -""" == Transformation of textual data ======================================================== """ +""" == Conversion of text files ============================================================== """ def convert_md_to_rst(path_to_md, path_to_rst, verbose=False, pandoc_exe=None, **kwargs): @@ -646,10 +650,10 @@ def convert_md_to_rst(path_to_md, path_to_rst, verbose=False, pandoc_exe=None, * >>> from pyhelpers.text import convert_md_to_rst >>> from pyhelpers.dir import cd - >>> dat_dir = cd("tests\\data") + >>> dat_dir = cd("tests\\documents") - >>> path_to_md_file = cd(dat_dir, "markdown.md") - >>> path_to_rst_file = cd(dat_dir, "markdown.rst") + >>> path_to_md_file = cd(dat_dir, "readme.md") + >>> path_to_rst_file = cd(dat_dir, "readme.rst") >>> convert_md_to_rst(path_to_md_file, path_to_rst_file, verbose=True) Converting "tests\\data\\markdown.md" to "tests\\data\\markdown.rst" ... Done. @@ -659,30 +663,34 @@ def convert_md_to_rst(path_to_md, path_to_rst, verbose=False, pandoc_exe=None, * # assert abs_md_path.suffix == ".md" and abs_rst_path.suffix == ".rst" if verbose: - rel_md_path = pathlib.Path(os.path.relpath(abs_md_path)) - rel_rst_path = pathlib.Path(os.path.relpath(abs_rst_path)) + rel_md_path, rel_rst_path = map( + lambda x: pathlib.Path(os.path.relpath(x)), (abs_md_path, abs_rst_path)) + if not os.path.exists(abs_rst_path): - print("Converting \"{}\" to \"{}\"".format(rel_md_path, rel_rst_path), end=" ... ") + msg = "Converting \"{}\" to \"{}\"".format(rel_md_path, rel_rst_path) else: - print("Updating \"{}\" at \"{}\\\"".format(rel_rst_path.name, rel_rst_path.parent), - end=" ... ") + msg = "Updating \"{}\" at \"{}\\\"".format(rel_rst_path.name, rel_rst_path.parent) + print(msg, end=" ... ") - if pandoc_exe is None: - pandoc_exe = '"{}"'.format("C:\\Program Files\\Pandoc\\pandoc.exe") - if not os.path.isfile(pandoc_exe): - pandoc_exe = "pandoc" + pandoc_exe_ = copy.copy(pandoc_exe) + if pandoc_exe_ is None: + pandoc_exe_ = find_executable( + app_name="pandoc.exe", possibilities=["C:\\Program Files\\Pandoc\\pandoc.exe"]) try: subprocess.call( - '{} "{}" -f markdown -t rst -s -o "{}"'.format(pandoc_exe, abs_md_path, abs_rst_path)) + '"{}" "{}" -f markdown -t rst -s -o "{}"'.format(pandoc_exe_, abs_md_path, abs_rst_path)) - print("Done.") if verbose else "" + if verbose: + print("Done.") except FileNotFoundError: import pypandoc pypandoc.convert_file(str(abs_md_path), 'rst', outputfile=str(abs_rst_path), **kwargs) - print("Done.") if verbose else "" + + if verbose: + print("Done.") except Exception as e: print("Failed. {}".format(e))