Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed first year scraper; Improved directory structure. #38

Merged
merged 14 commits into from
Sep 11, 2023
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions .env.template

This file was deleted.

7 changes: 2 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
.env
chillzone
t
schedule.json
subjectDetails.json
all_subjects.json
empty_schedule.json
.idea
.vscode/
*.xlsx
frontend/node_modules/
frontend/coverage/
frontend/build/
first_year_scraper/__pycache__
__pycache__/
other-years-scraper/other-years-scraper
Binary file added first-year-scraper/aut2023.pdf
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import camelot
import camelot.io as camelot

def parse_pdf(filename, target_filename):
print("Started processing pdf file. This might take a while...")
Expand All @@ -7,4 +7,4 @@ def parse_pdf(filename, target_filename):
print("Done parsing pdf. exported as: ", target_filename)

if __name__ == "__main__":
parse_pdf("spr2019.pdf","test.xlsx")
parse_pdf("test.pdf","test.xlsx")
63 changes: 63 additions & 0 deletions first-year-scraper/generate_schedule.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from openpyxl import load_workbook
import generate_subjectDetails
import json



def getkey(a, b):
    '''Return the Excel cell key (for example "D5") for column a and row b.

    a is the 1-based column number and b is the row number. Columns past
    Z get multi-letter names the way Excel writes them (27 -> "AA").

    Raises:
        ValueError: if a is smaller than 1, because no such column exists.
    '''
    if a < 1:
        raise ValueError("column index must be >= 1, got {}".format(a))

    letters = ""
    remaining = a
    while remaining > 0:
        # Column names have no zero digit, so shift to 0-based before
        # peeling off each letter.
        remaining, offset = divmod(remaining - 1, 26)
        letters = chr(ord('A') + offset) + letters
    return letters + str(b)

def format_cell(original_value):
    '''Split a cell's text into a flat list of subject name and room numbers.

    The text is broken on newlines, then spaces, then commas; empty pieces
    are dropped and the remaining tokens keep their original order.

    Returns an empty list when original_value is None (a cell with no text)
    instead of failing on the missing string.
    '''
    if original_value is None:
        return []

    tokens = []
    for line in original_value.split("\n"):
        for word in line.split(" "):
            # A word may still hold several comma-joined room numbers.
            tokens.extend(piece for piece in word.split(",") if piece)
    return tokens

def generate_schedule(valid_sheets, workbook, subjects_dict):
'''Update the frontend schedule files from the timetable sheets.

Loads ../frontend/src/schedule.json and ../frontend/src/empty_schedule.json,
updates them from the cells of every sheet named in valid_sheets, and
writes both files back with a 2-space indent.

valid_sheets  -- sheet names used to index workbook
workbook      -- object indexed by sheet name; each sheet is read by cell
                 key (e.g. 'N4') and supports delete_cols
subjects_dict -- indexed by the first token of a cell; element [0] of the
                 entry is what gets stored in the schedule

NOTE(review): indentation was lost in this copy of the file, so the nesting
described in the comments below is an assumption wherever it is marked.
'''
# Existing room schedule; indexed below as schedule_dict[room][i][j].
with open("../frontend/src/schedule.json", "r") as json_data:
schedule_dict = json.load(json_data)
# Rooms listed per slot; indexed below as empty_schedule_dict[i][j].
with open("../frontend/src/empty_schedule.json", "r") as json_data:
empty_schedule_dict = json.load(json_data)

for sheet in valid_sheets:
worksheet = workbook[sheet]

# A value in N4, or no value in M4, is reported as a parsing error.
# presumably the table is expected to end at column M; confirm.
if(worksheet['N4'].value is not None or worksheet['M4'].value is None):
print("parsing error in sheet {}".format(sheet))

# For aut2023.pdf only page 5 has a parsing error, in which one column is repeated.
# That column is deleted manually here.
# NOTE(review): presumably this runs only for sheets flagged above; confirm.
worksheet.delete_cols(7,1)

# Drops one column at index 9 before the cells are read.
# NOTE(review): the reason is not stated here; confirm against the PDF layout.
worksheet.delete_cols(9,1)
# NOTE(review): the next two lines are GitHub review-UI text captured with
# the diff, not Python; they are not part of the function.
chirag-ghosh marked this conversation as resolved.
Show resolved Hide resolved
# Reads rows i+5 (5..10) and columns j+4 (D..L) through getkey.
# presumably i is the weekday and j the time slot; confirm.
for i in range(0, 6):
for j in range(0, 9):
cell = worksheet[getkey(j+4, i+5)].value
# Only cells whose text contains "NR" or "NC" are processed.
if cell is not None and ("NR" in cell or "NC" in cell):
cell_value = format_cell(cell)
# First token is the key into subjects_dict; every later token is
# used as a room key into schedule_dict.
for room in cell_value[1::]:
# A slot is only filled while it still holds the empty string.
if(schedule_dict[room][i][j]==""):
schedule_dict[room][i][j] = subjects_dict[cell_value[0]][0]
# The room is taken out of the slot's list in empty_schedule_dict.
if(room in empty_schedule_dict[i][j]):
empty_schedule_dict[i][j].remove(room)

# Write the updated structures back over the files that were loaded.
with open("../frontend/src/schedule.json", "w") as outfile:
json.dump(schedule_dict, outfile, indent=2)

with open("../frontend/src/empty_schedule.json", "w") as outfile:
json.dump(empty_schedule_dict, outfile, indent=2)


def format_excel(filename, subjects_dict):
    '''Open the workbook and pass its timetable sheets to generate_schedule.

    A sheet is kept when cell D10 or cell E10 holds the text "EAA". The
    number of kept sheets is printed before the schedule is generated.
    '''
    workbook = load_workbook(filename=filename)

    valid_sheets = []
    for sheet_name in workbook.sheetnames:
        worksheet = workbook[sheet_name]
        if worksheet['D10'].value == "EAA" or worksheet['E10'].value == "EAA":
            valid_sheets.append(sheet_name)

    print("count of sheets with timetable : {}".format(len(valid_sheets)))

    generate_schedule(valid_sheets, workbook, subjects_dict)


if __name__ == "__main__":
    # Build the subject lookup first, then fill the schedule from workbook.xlsx.
    subjects = generate_subjectDetails.generate_subjectDetails()
    format_excel("workbook.xlsx", subjects)
46 changes: 46 additions & 0 deletions first-year-scraper/generate_subjectDetails.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from openpyxl import load_workbook
import json

def get_cell_value(a, b):
    '''Return the value stored at column a, row b of the module-level sheet.

    a is the 1-based column number (1 -> "A") and b is the row number.
    '''
    column_letter = chr(ord('A') + a - 1)
    cell_key = column_letter + str(b)
    return sheet[cell_key].value

def generate_subjectDetails():
    '''Build the subject lookup from workbook.xlsx and merge it into subjectDetails.json.

    Reads rows 4-18 of the third sheet of workbook.xlsx, taking columns D, E
    and F of each row. Rows whose column E contains "Laboratory" are
    ignored. Rows where any of the three cells is blank or not text are
    skipped, so a sheet with fewer than 15 subject rows no longer fails.

    Every kept row adds an entry keyed by the first two characters of
    column D. Unless the third character of column F is "0", a second
    "(T)" entry with " TUTORIAL" appended to the name is added as well.
    presumably column F encodes the tutorial count; confirm.

    The full code -> name pairs are merged into
    ../frontend/src/subjectDetails.json, which must already exist.

    Returns:
        dict mapping each short key to [full code, upper-cased name].
    '''
    global sheet
    workbook = load_workbook("workbook.xlsx")
    # get_cell_value() reads the worksheet through this module-level name.
    sheet = workbook[workbook.sheetnames[2]]

    subjects_dict = {}

    for row in range(4, 19):
        full_code = get_cell_value(4, row)
        title = get_cell_value(5, row)
        detail = get_cell_value(6, row)

        # Blank (None) or non-text cells cannot be sliced or searched.
        if not all(isinstance(value, str) for value in (full_code, title, detail)):
            continue
        if "Laboratory" in title:
            continue

        short_code = full_code[0:2]
        subjects_dict[short_code] = [full_code, title.upper()]
        if detail[2:3] != "0":
            subjects_dict[short_code + "(T)"] = [
                full_code + "(T)",
                title.upper() + " TUTORIAL",
            ]

    with open("../frontend/src/subjectDetails.json", "r") as json_data:
        subject_details = json.load(json_data)

    for code, name in subjects_dict.values():
        subject_details[code] = name

    with open("../frontend/src/subjectDetails.json", "w") as outfile:
        json.dump(subject_details, outfile, indent=2)

    return subjects_dict


if __name__ == "__main__":
    generate_subjectDetails()
10 changes: 10 additions & 0 deletions first-year-scraper/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import excel_parser
import generate_subjectDetails
import generate_schedule

# Workbook that the PDF is exported to and that the later steps read back.
target_filename = "workbook.xlsx"

filename = input("\nEnter the first year timetable pdf file name: ")

# Step 1: parse the timetable PDF into the workbook.
excel_parser.parse_pdf(filename, target_filename)

# Step 2: build the subject lookup, then fill the schedule from the workbook.
subjects_dict = generate_subjectDetails.generate_subjectDetails()
generate_schedule.format_excel(target_filename, subjects_dict)
Binary file added first-year-scraper/test.pdf
Binary file not shown.
40 changes: 0 additions & 40 deletions first-year.csv

This file was deleted.

8 changes: 0 additions & 8 deletions first-year.csv.template

This file was deleted.

Binary file removed first_year_scraper/aut2019.pdf
Binary file not shown.
113 changes: 0 additions & 113 deletions first_year_scraper/excel_formatter.py

This file was deleted.

40 changes: 0 additions & 40 deletions first_year_scraper/first_year.csv

This file was deleted.

23 changes: 0 additions & 23 deletions first_year_scraper/main.py

This file was deleted.

Loading