-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathconfig.py
56 lines (40 loc) · 2.64 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# CONFIGURATION VALUES
# Default language when scraping content (BCP 47)
DEFAULT_LANG = 'en' # Default: 'en'
# Whether full content should be scraped, in addition to basic metadata (default language only) (warning: scraping full content will send up to 1600 http requests – one for each chapter)
SCRAPE_FULL_CONTENT = True # Default: True
# Whether basic metadata should be scraped for all languages (if False, only metadata in the default language will be included)
SCRAPE_METADATA_FOR_ALL_LANGUAGES = False # Default: False
# The script will pause between requests to avoid hitting the Church server too frequently
SECONDS_TO_PAUSE_BETWEEN_REQUESTS = 1 # Default: 1
# Number of spaces to indent in JSON output
JSON_INDENT = 2 # Default: 2
# Whether test data should be used (only includes a subset of chapters)
USE_TEST_DATA = False # Default: False
# The following values are only applicable when SCRAPE_FULL_CONTENT is True
if SCRAPE_FULL_CONTENT:
# Full content output format
OUTPUT_AS_JSON = True # Default: True
OUTPUT_AS_HTML = True # Default: True
OUTPUT_AS_MD = True # Default: True
OUTPUT_AS_TXT = True # Default: True
OUTPUT_AS_CSV = True # Default: True
OUTPUT_AS_TSV = True # Default: True
OUTPUT_AS_SQL_MYSQL = True # Default: True
OUTPUT_AS_SQL_SQLITE = True # Default: True
# Whether full content output should be split by chapter and put into a nested directory structure (only applicable for JSON output)
SPLIT_JSON_BY_CHAPTER = True # Default: True
# Whether full content output should be minified (only applicable for JSON output)
MINIFY_JSON = False # Default: False
# Whether spans with CSS classes should be converted to simple HTML tags (only applicable for HTML output)
BASIC_HTML = False # Default: False
# Whether links to images (Abraham facsimiles) should be included
INCLUDE_IMAGES = True # Default: True
# Whether links, footnotes, chapter summaries, and other potentially copyrighted content should be included (warning: this is intended for personal use only – copyrighted content should not be distributed online or used in any public or commercial product, without permission from the Church)
INCLUDE_COPYRIGHTED_CONTENT = False # Default: False
# Whether audio, video, PDF, and other related media information should be included (requires Playwright for Python, and may run a little slower). You can install Playwright by running these commands in Terminal:
# pip3 install pytest-playwright
# playwright install
INCLUDE_MEDIA_INFO = False # Default: False
# Whether a CSS stylesheet should be added to the output files
ADD_CSS_STYLESHEET = True # Default: True