-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpython_cisteni.py
133 lines (101 loc) · 3.24 KB
/
python_cisteni.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import pandas as pd
import csv
# Read the whole input CSV into memory. newline="" lets the csv module handle
# line endings itself, so quoted fields that contain line breaks are parsed
# as one field instead of being split across rows.
with open("all_in_one.csv", mode="r", encoding="utf-8", newline="") as file:
    csv_reader = csv.reader(file)
    rows = list(csv_reader)

# The first row holds the column names; everything after it is data.
headers = rows[0]
data = rows[1:]

# Build the DataFrame from the data rows, labelled with the header row.
df = pd.DataFrame(data, columns=headers)
# Cities of interest; a listing is kept only if its address mentions one of them.
cities = [
    "Praha", "Brno", "Ostrava", "Plzeň", "Liberec", "Olomouc",
    "České Budějovice", "Hradec Králové", "Pardubice", "Ústí nad Labem",
    "Karlovy Vary", "Jihlava", "Zlín",
]

# Flat layouts (arrangements) of interest.
arrangements = [
    "1+0",
    "1+kk",
    "1+1",
    "2+kk",
    "2+1",
    "3+kk",
    "3+1",
    "4+1",
    "4+kk",
]
# Keep only the rows that satisfy all of the conditions below and store them
# in a new DataFrame. Boolean masks are used instead of a row-by-row loop so
# that the result keeps df's columns even when no row matches (a DataFrame
# built from an empty list of rows would have no columns at all).

# Lowercase the city names once instead of once per row.
cities_lower = [city.lower() for city in cities]

# 1) the listing URL does not contain "idnes"
not_idnes = ~df["data_url"].str.contains("idnes", regex=False, na=False)
# 2) the listing type contains "apartment"
is_apartment = df["data_type"].str.contains("apartment", regex=False, na=False)
# 3) the offer type does not contain "auction"
not_auction = ~df["data_offerType"].str.contains("auction", regex=False, na=False)
# 4) the address mentions one of the cities of interest (case-insensitive)
in_known_city = (
    df["data_address"]
    .str.lower()
    .map(
        lambda address: isinstance(address, str)
        and any(city in address for city in cities_lower)
    )
    .astype(bool)
)
# 5) the arrangement is exactly one of the arrangements of interest
known_arrangement = df["data_arrangement"].isin(arrangements)

filtered_df = df[
    not_idnes & is_apartment & not_auction & in_known_city & known_arrangement
].copy()
# Columns kept for the rest of the script; every other column is discarded.
columns = [
    "createdAt",
    "data_priceTotal",
    "data_price",
    "data_priceType",
    "data_arrangement",
    "data_livingArea",
    "data_address",
    "data_energyClass",
    "id",
    "data_city",
    "data_buildingType",
    "data_district",
    "data_offerType",
    "data_equipment",
    "data_ownership",
    "data_propertyState",
    "data_type",
    "data_url",
    "isLive",
    "markAsDeadAt",
]

# Take an independent copy of the selection: columns are added to and renamed
# on this frame afterwards, and doing that on a slice of filtered_df is what
# triggers pandas' SettingWithCopyWarning.
all_in_one_df = filtered_df[columns].copy()
def extract_realitka(url):
    """Return the site name embedded in a listing URL.

    For a host with a subdomain ("https://www.sreality.cz/...") this is the
    text between the first and the second dot of the URL ("sreality").
    For a bare two-label host ("https://bezrealitky.cz/...") it is the first
    host label ("bezrealitky"); taking the text after the first dot would
    return the top-level domain glued to the path.

    Args:
        url: The listing URL as a string, with or without a scheme.

    Returns:
        The site name, or None when the URL contains no dot.
    """
    from urllib.parse import urlsplit

    try:
        # Without "//" urlsplit would treat the host as part of the path.
        host = urlsplit(url if "//" in url else "//" + url).hostname
    except ValueError:
        # Malformed authority (e.g. a broken IPv6 literal): use the plain split.
        host = None

    if host:
        labels = host.split(".")
        if len(labels) == 2:
            return labels[0]

    parts = url.split(".")
    if len(parts) > 1:
        return parts[1]
    return None
# Extract the site name from every listing URL into a new 'realitka' column.
all_in_one_df["realitka"] = all_in_one_df["data_url"].apply(extract_realitka)

# Remove the 'data_' prefix from the column names that carry it.
new_columns = []
for col in all_in_one_df.columns:
    if col.startswith("data_"):
        col = col[5:]
    new_columns.append(col)
all_in_one_df.columns = new_columns
def determine_city(row, known_cities=None):
    """Pick the city a listing belongs to from its comma-separated address.

    Candidate cities are tried in list order. A city is returned as soon as
    its name occurs (case-insensitively, as a substring) in the first
    comma-separated part of the address, or in the second part unless that
    part starts with "okres" (e.g. "okres Praha-východ"). Parts after the
    second one are never examined. An address that begins with the city name
    is already covered by the first-part check.

    Args:
        row: Mapping-like object with an "address" entry; the value is
            converted with str() before matching.
        known_cities: Candidate city names in priority order. Defaults to
            the module-level ``cities`` list.

    Returns:
        The matching city exactly as spelled in the candidate list, or None
        when no candidate matches.
    """
    if known_cities is None:
        known_cities = cities

    # Lowercase the address parts once, not once per candidate city.
    parts = [part.lower() for part in str(row["address"]).split(",")]
    first = parts[0]
    second = parts[1] if len(parts) > 1 else None
    # A second part that starts with "okres" is skipped entirely.
    second_usable = second is not None and not second.lstrip().startswith("okres")

    for city in known_cities:
        needle = city.lower()
        if needle in first:
            return city
        if second_usable and needle in second:
            return city
    return None
# Determine the city from the address and store it in the 'city' column.
all_in_one_df["city"] = all_in_one_df.apply(determine_city, axis=1)

# Drop rows without a recognised city or without a price. The values were read
# from the CSV as text, so a missing price is an empty string rather than NaN;
# blank prices therefore have to be treated as missing explicitly.
has_city = all_in_one_df["city"].notna()
has_price = all_in_one_df["price"].notna() & (
    all_in_one_df["price"].astype(str).str.strip() != ""
)
cleaned_data = all_in_one_df[has_city & has_price]

# Remove the columns that are no longer needed.
cleaned_data = cleaned_data.drop(columns=["address", "url"])

# Save the cleaned DataFrame to CSV.
cleaned_data.to_csv("all_in_one_output.csv", index=False, encoding="utf-8")
# Read the written file back so the result can be checked. The list is rebuilt
# from scratch so that the output rows are not appended to the input rows that
# 'data' still holds; newline="" lets the csv module handle line endings.
with open("all_in_one_output.csv", mode="r", encoding="utf-8", newline="") as file:
    csv_reader = csv.reader(file)
    data = list(csv_reader)