# main.py — forked from pratikrelekar/StreamLit_SEC
import streamlit as st
from sec_edgar_downloader import Downloader
from fuzzywuzzy import process
import os
from edgar import Edgar
from functools import lru_cache
import concurrent.futures
from minio import Minio
import shutil
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import re
load_dotenv()  # pull MINIO_* credentials from a local .env file

# MinIO configurations
MINIO_ENDPOINT = 's3.dsrs.illinois.edu'
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY')
MINIO_BUCKET_NAME = '10-k'
MINIO_SECURE = True

# Initialize Minio client
minio_client = Minio(
    MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY,
    secure=MINIO_SECURE,
)

# Initialize Edgar objects
edgar_obj = Edgar()
edgar_download = Downloader("DummyCompany", "dummy@email.com")

# Bucket every known company name under its first-three-letter prefix so the
# fuzzy search only has to scan a small candidate pool per query.
company_index = {}
all_companies = list(edgar_obj.all_companies_dict.keys())
for company in all_companies:
    company_index.setdefault(company[:3].upper(), []).append(company)
@lru_cache(maxsize=1000)
def get_matching_companies(name):
    """Fuzzy-match *name* against companies sharing its 3-letter prefix.

    Returns up to five best-matching company names; results are cached
    per query string.
    """
    candidates = company_index.get(name[:3].upper(), [])
    return [candidate for candidate, _score in process.extract(name, candidates, limit=5)]
def is_cik(input_str):
    """Return True if *input_str* looks like a zero-padded 10-digit SEC CIK."""
    return len(input_str) == 10 and input_str.isdigit()
def move_and_merge(src, dest):
    """Move the directory tree at *src* into *dest*, merging contents.

    If *dest* does not exist, *src* simply becomes *dest*. Otherwise every
    file under *src* (at any depth) is moved into the corresponding path
    under *dest*, overwriting same-named files, and *src* is removed.

    Fixes two bugs in the previous version: nested files were deleted
    (subdirectories were rmtree'd before their files were moved), and
    os.rename failed across filesystems. shutil.move handles both.
    """
    if not os.path.exists(dest):
        shutil.move(src, dest)  # works across filesystems, unlike os.rename
        return
    for root, _dirs, files in os.walk(src):
        # Mirror src's internal structure under dest instead of flattening.
        rel = os.path.relpath(root, src)
        target_dir = dest if rel == "." else os.path.join(dest, rel)
        os.makedirs(target_dir, exist_ok=True)
        for name in files:
            src_path = os.path.join(root, name)
            dest_path = os.path.join(target_dir, name)
            if os.path.exists(dest_path):
                os.remove(dest_path)  # overwrite: incoming file wins
            shutil.move(src_path, dest_path)
    shutil.rmtree(src)  # only empty scaffolding remains at this point
def clean_file_with_soup(filepath):
    """Extract plain text from a downloaded EDGAR filing.

    Parses the file's <text> element, then drops lines that are a single
    bare token (encoding artifacts such as uuencoded payload markers).
    Falls back to the raw file content when no <text> element is found.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        raw = handle.read()
    document = BeautifulSoup(raw, 'lxml').find('text')
    if not document:
        return raw
    token_only = re.compile(r'^[a-zA-Z0-9-_]+$')
    kept = [
        line
        for line in document.get_text(separator="\n").split("\n")
        if not token_only.match(line)
    ]
    return "\n".join(kept)
def download_and_upload_10k_files(company_name, year):
    """Download a company's 10-K filings for *year*, clean and upload them.

    Downloads via sec-edgar-downloader, strips markup from each file, pushes
    it to MinIO, and returns ``(status_message, url)`` where *url* is a
    presigned link to the last uploaded object, or None if nothing was
    uploaded or an error occurred. All failures are reported in the message
    rather than raised, so the Streamlit caller can display them.
    """
    try:
        cik = edgar_obj.get_cik_by_company_name(company_name)
        edgar_download.get("10-K", cik, after=f"{year}-01-01", before=f"{year}-12-31")
        src_folder = os.path.join("sec-edgar-filings", cik)
        dest_folder = os.path.join("sec-edgar-filings", company_name.replace(" ", "_"))
        if not os.path.exists(src_folder):
            return f"No 10-K filings were downloaded for {company_name} in {year}.", None
        move_and_merge(src_folder, dest_folder)
        # Previously `url` was never initialized, so an empty folder made the
        # success return raise UnboundLocalError (masked as "Error: ...").
        url = None
        safe_name = company_name.replace(" ", "_")
        for root, _, files in os.walk(dest_folder):
            for file in files:
                file_path = os.path.join(root, file)
                cleaned_content = clean_file_with_soup(file_path)
                with open(file_path, 'w', encoding='utf-8') as cleaned_file:
                    cleaned_file.write(cleaned_content)
                # MinIO object keys must use '/' separators; os.path.join
                # would produce '\\' on Windows and corrupt the key layout.
                minio_path = "/".join(["10-k", safe_name, str(year), f"{cik}_{year}_{file}"])
                minio_client.fput_object(MINIO_BUCKET_NAME, minio_path, file_path)
                url = minio_client.presigned_get_object(MINIO_BUCKET_NAME, minio_path)
        return f"Downloaded and uploaded 10-K filings for {company_name} in {year}.", url
    except Exception as e:
        return f"Error: {str(e)}", None
# Streamlit App
# Flow: resolve user input to a (company_name, cik) pair, then offer a year
# picker and a download button. Guard branches call st.stop() — previously
# an invalid CIK or an empty match list fell through and the download button
# raised NameError because company_name/cik were never assigned.
st.title('SEC 10-K Filings')
input_data = st.text_input("Enter the company name or CIK:")
if input_data:
    if is_cik(input_data):
        cik = input_data
        company_name = edgar_obj.get_company_name_by_cik(cik)
        if not company_name:
            st.write("Invalid CIK provided.")
            st.stop()  # nothing to download without a resolvable company
    else:
        matches = get_matching_companies(input_data)
        if matches:
            matches_with_cik = [f"{match} ({edgar_obj.get_cik_by_company_name(match)})" for match in matches]
            selected = st.selectbox('Select the correct company:', matches_with_cik)
            company_name = selected.split(" (")[0]
            cik = edgar_obj.get_cik_by_company_name(company_name)
        else:
            st.write("No matching companies found.")
            st.stop()  # avoid NameError on company_name/cik below
    selected_year = st.selectbox('Select the year:', list(range(1993, 2023)))
    if st.button('Download 10-K filings'):
        with st.spinner('Processing... Downloading from SEC, cleaning, and uploading. Please wait...'):
            # Run the blocking download in a worker thread so the spinner
            # stays responsive; we still wait synchronously for the result.
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future = executor.submit(download_and_upload_10k_files, company_name, selected_year)
                result, url = future.result()
            st.write(result)
            if url:
                st.write(f"[Download the cleaned file here.]({url})")