Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BY] Use WFS to get data #142

Merged
merged 1 commit into the base branch from the source branch
Jul 15, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 39 additions & 62 deletions jedeschule/spiders/bayern.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,50 @@
# -*- coding: utf-8 -*-
from urllib import parse

import xml.etree.ElementTree as ET
import scrapy
from scrapy import Item
from scrapy.shell import inspect_response

from jedeschule.items import School
from jedeschule.utils import get_first_or_none, cleanjoin


class BayernSpider(scrapy.Spider):
name = "bayern"
# allowed_domains = ["https://www.km.bayern.de/schueler/schulsuche.html"]
start_urls = ['https://www.km.bayern.de/schueler/schulsuche.html?s=&t=9999&r=9999&o=9999&u=0&m=3&seite=1']

def parse(self, response):
    """Read the pagination control and schedule one request per result page.

    Each page of the school search is handed to :meth:`parse_list`.
    Yields :class:`scrapy.Request` objects.
    """
    number_of_pages = response.css(
        "div.schulsuche > div > p.Right a:last-child::text"
    ).extract_first()
    if number_of_pages is None:
        # extract_first() returns None when the pagination markup is missing
        # or changed; int(None) would raise TypeError, so bail out explicitly.
        self.logger.error("Could not determine the number of result pages")
        return
    url = "https://www.km.bayern.de/schueler/schulsuche.html?s=&t=9999&r=9999&o=9999&u=0&m=3&seite={page}"
    for page in range(1, int(number_of_pages) + 1):
        yield scrapy.Request(url.format(page=page),
                             callback=self.parse_list)

def parse_list(self, response):
    """Follow every school-detail link found on one results page."""
    for href in response.css('.ListSchools a::attr(href)').extract():
        yield scrapy.Request(response.urljoin(href),
                             callback=self.parse_detail)

def get_lat_lon(self, response):
    """Extract (latitude, longitude) from the first geoportal link in <article>.

    The link's query string is expected to carry 'N' and 'E' parameters.
    Returns a (lat, lon) tuple of strings, or (None, None) when the link
    is missing or lacks those parameters.
    """
    try:
        geoportal_href = response.css("article > a::attr(href)").extract_first()
        querystring = parse.parse_qs(geoportal_href)
        return querystring['N'][0], querystring['E'][0]
    except (AttributeError, TypeError, KeyError, IndexError):
        # AttributeError/TypeError: href was None (no link found);
        # KeyError/IndexError: query string without N/E parameters.
        # Was a bare `except:`, which also swallowed KeyboardInterrupt etc.
        return None, None

def parse_detail(self, response):
    """Scrape one school detail page into a plain dict item."""
    text = response.css("article ::text")
    street, city = response.css("article > p")[0].css("::text").extract()
    item = {
        'street': street,
        'city': city,
        'name': cleanjoin(response.css('article h1::text').extract(), ""),
        'phone': get_first_or_none(text.re("Telefon: ([0-9 /]+)")),
        'fax': get_first_or_none(text.re("Fax: ([0-9 /]+)")),
        'web': response.css("article a::attr(href)").extract_first(),
        'number': get_first_or_none(text.re("Schulnummer: ([0-9]+)")),
        'school_type': get_first_or_none(text.re("Schulart: (.+)")),
        'type': get_first_or_none(text.re("Rechtlicher Status: (.+)")),
        'teachers': get_first_or_none(text.re("Hauptamtliche Lehrkräfte: ([0-9]+)")),
        'students': get_first_or_none(text.re("Schüler: ([0-9]+)")),
        'url': response.url,
    }
    item['latitude'], item['longitude'] = self.get_lat_lon(response)
    yield item
start_urls = ['https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?SERVICE=WFS&VERSION=2.0.0&REQUEST=GetCapabilities']

def parse(self, response, **kwargs):
    """Parse the WFS GetCapabilities XML and request every feature type.

    One GetFeature request is issued per advertised FeatureType; responses
    are handled by :meth:`parse_resource`, which receives the feature name
    via ``cb_kwargs``.
    """
    tree = ET.fromstring(response.body)
    wfs = "{http://www.opengis.net/wfs/2.0}"
    base_url = 'https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?SERVICE=WFS&VERSION=2.0.0&REQUEST=GetFeature&srsname=EPSG:4326&typename='
    for feature_type in tree.iter(f"{wfs}FeatureType"):
        feature = feature_type.findtext(f"{wfs}Title")
        if feature is None:
            # findtext() returns None for a FeatureType without a <Title>;
            # interpolating it would request the literal typename "None".
            self.logger.warning("Skipping FeatureType without a Title")
            continue
        yield scrapy.Request(f"{base_url}{feature}",
                             callback=self.parse_resource,
                             cb_kwargs={"feature": feature})

def parse_resource(self, response, feature):
    """Parse one WFS GetFeature response, yielding a dict per school.

    Each child element of a school feature becomes a key (namespace
    stripped); the gml geometry is unpacked into 'lat'/'lon'.
    """
    gml_ns = "{http://www.opengis.net/gml/3.2}"
    schul_ns = "{http://gdi.bayern/brbschul}"
    namespaces = {
        "gml": "http://www.opengis.net/gml/3.2",
        "schul": "http://gdi.bayern/brbschul",
    }
    # Turn the prefixed feature name (e.g. "schul:...") into Clark notation.
    school_tag = feature.replace("schul:", schul_ns)
    for school in ET.fromstring(response.body).iter(school_tag):
        record = {'id': school.attrib[f"{gml_ns}id"]}
        for child in school:
            if child.tag == f"{schul_ns}geometry":
                pos = child.findtext("gml:Point/gml:pos", namespaces=namespaces)
                record["lat"], record["lon"] = pos.split(" ")
            else:
                # Strip the '{namespace}' prefix so keys are plain names.
                record[child.tag.split("}", 1)[1]] = child.text
        yield record

@staticmethod
def normalize(item: Item) -> School:
    """Map the raw WFS attribute names onto the shared School item.

    Keys ('schulname', 'strasse', 'ort', …) are the namespace-stripped
    element names produced by parse_resource; missing keys yield None.
    """
    # NOTE(review): the diff fusion left two return statements here; the
    # first used the pre-WFS keys ('city', 'number', …) that the new items
    # no longer carry (item.get('city') is None -> .split() would raise
    # AttributeError) and made the second return unreachable. Only the
    # post-merge body is kept.
    return School(name=item.get('schulname'),
                  address=item.get('strasse'),
                  city=item.get('ort'),
                  school_type=item.get('schulart'),
                  zip=item.get('postleitzahl'),
                  id='BY-{}'.format(item.get('id')),
                  latitude=item.get('lat'),
                  longitude=item.get('lon'))
Loading