-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathx12scrape.py
executable file
·69 lines (53 loc) · 1.65 KB
/
x12scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/python3
import requests
import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import Workbook
from urllib.parse import urlparse
def get_title_from_url(url):
    """Derive a human-readable title from the last path segment of *url*.

    Hyphens in the segment are replaced with spaces and the result is
    title-cased, e.g. ``.../codes/claim-status-codes`` -> ``"Claim Status Codes"``.

    Args:
        url: The URL whose path supplies the title.

    Returns:
        The title-cased last path segment, or an empty string when the URL
        has no path segment at all.
    """
    path = urlparse(url).path
    # Strip trailing slashes first so ".../claim-status-codes/" still yields
    # "Claim Status Codes" rather than an empty title.
    last_segment = path.rstrip('/').rsplit('/', 1)[-1]
    return last_segment.replace('-', ' ').title()
def scrape_codes(url):
    """Fetch *url* and extract its code/description pairs.

    The page is searched for the element with id ``codelist``; within it,
    every element matching ``.prod_set.current`` contributes one entry built
    from the text of its ``td.code`` and ``td.description`` cells.

    Args:
        url: Address of the page to scrape.

    Returns:
        ``None`` when the page has no ``codelist`` element. Otherwise a
        two-item list ``[codes, title]`` where ``codes`` is a dict mapping
        code text to description text and ``title`` is the title derived
        from the URL, cut to at most 30 characters.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Bound the request so one unresponsive server cannot hang the whole run.
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')
    codelist = soup.find(id="codelist")
    if codelist is None:
        return None

    codes = {}
    for row in codelist.select(".prod_set.current"):
        code_cell = row.select_one("td.code")
        desc_cell = row.select_one("td.description")
        # Skip malformed rows instead of aborting the page on a missing cell.
        if code_cell is None or desc_cell is None:
            continue
        codes[code_cell.get_text()] = desc_cell.get_text()

    # create_codefile uses this title as a worksheet name; presumably the
    # 30-character cut keeps it within the spreadsheet's sheet-name limit.
    title = get_title_from_url(url)[:30]
    return [codes, title]
def create_codefile(urls):
    """Scrape every URL in *urls* and write the results to ``x12codes.xlsx``.

    Each page that yields codes gets its own worksheet: column 1 holds the
    code, column 2 its description, one pair per row starting at row 1. The
    worksheet is named after the page title when that title is non-empty.
    Pages for which ``scrape_codes`` returns ``None`` are skipped.

    Args:
        urls: Iterable of page URLs to scrape.
    """
    wb = Workbook()
    ws = None
    for u in urls:
        scraped_codes = scrape_codes(u)
        if scraped_codes is None:
            continue
        codes, title = scraped_codes

        # Allocate a sheet only when there is a page to write: reuse the
        # workbook's initial sheet first, then add new ones. Creating the
        # next sheet eagerly after each page would leave a spurious empty
        # sheet at the end of the saved workbook.
        ws = wb.active if ws is None else wb.create_sheet()

        for row, (code, description) in enumerate(codes.items(), start=1):
            ws.cell(row, 1, code)
            ws.cell(row, 2, description)
        if title:
            ws.title = title
    wb.save("x12codes.xlsx")
def get_urls():
    """Collect the code-list page links from the nex12.org codes index.

    Downloads the index page, takes the first table matching
    ``.item-page table`` inside the element with id ``content``, and returns
    the targets of every link in that table.

    Returns:
        A list of absolute URLs, in document order. Anchors without an
        ``href`` are omitted.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        AttributeError: If the page has no element with id ``content``.
        IndexError: If no matching table is found inside that element.
    """
    from urllib.parse import urljoin

    index_url = "https://nex12.org/index.php/codes"
    # Bound the request so an unresponsive server cannot hang the script.
    page = requests.get(index_url, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')
    content = soup.find(id="content")
    link_table = content.select(".item-page table")
    a_tags = link_table[0].find_all("a")
    # Resolve relative hrefs against the index page (absolute hrefs pass
    # through unchanged) and drop anchors that carry no href, since the
    # results are handed straight to requests.get by the caller.
    return [urljoin(index_url, a["href"]) for a in a_tags if a.get("href")]
# Script entry point: gather the code-list page URLs from the index, then
# scrape each one into x12codes.xlsx. These statements run at module load.
urls = get_urls()
create_codefile(urls)