Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BY] Use WFS to get data #142

Merged
merged 1 commit into the base branch from the source branch
Jul 15, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 39 additions & 62 deletions jedeschule/spiders/bayern.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,50 @@
# -*- coding: utf-8 -*-
from urllib import parse

import xml.etree.ElementTree as ET
import scrapy
from scrapy import Item
from scrapy.shell import inspect_response

from jedeschule.items import School
from jedeschule.utils import get_first_or_none, cleanjoin


class BayernSpider(scrapy.Spider):
name = "bayern"
# allowed_domains = ["https://www.km.bayern.de/schueler/schulsuche.html"]
start_urls = ['https://www.km.bayern.de/schueler/schulsuche.html?s=&t=9999&r=9999&o=9999&u=0&m=3&seite=1']

def parse(self, response):
    """Read the pagination control and schedule one request per result page.

    Each page of the school search is handed to :meth:`parse_list`.
    Yields :class:`scrapy.Request` objects.
    """
    number_of_pages = response.css(
        "div.schulsuche > div > p.Right a:last-child::text"
    ).extract_first()
    if number_of_pages is None:
        # extract_first() returns None when the pagination markup is missing
        # or changed; int(None) would raise TypeError, so bail out explicitly.
        self.logger.error("Could not determine the number of result pages")
        return
    url = "https://www.km.bayern.de/schueler/schulsuche.html?s=&t=9999&r=9999&o=9999&u=0&m=3&seite={page}"
    for page in range(1, int(number_of_pages) + 1):
        yield scrapy.Request(url.format(page=page),
                             callback=self.parse_list)

def parse_list(self, response):
    """Follow every school-detail link found on one results page."""
    for href in response.css('.ListSchools a::attr(href)').extract():
        yield scrapy.Request(response.urljoin(href),
                             callback=self.parse_detail)

def get_lat_lon(self, response):
    """Extract (latitude, longitude) from the first geoportal link in <article>.

    The link's query string is expected to carry 'N' and 'E' parameters.
    Returns a (lat, lon) tuple of strings, or (None, None) when the link
    is missing or lacks those parameters.
    """
    try:
        geoportal_href = response.css("article > a::attr(href)").extract_first()
        querystring = parse.parse_qs(geoportal_href)
        return querystring['N'][0], querystring['E'][0]
    except (AttributeError, TypeError, KeyError, IndexError):
        # AttributeError/TypeError: href was None (no link found);
        # KeyError/IndexError: query string without N/E parameters.
        # Was a bare `except:`, which also swallowed KeyboardInterrupt etc.
        return None, None

def parse_detail(self, response):
    """Scrape one school detail page into a plain dict item."""
    text = response.css("article ::text")
    street, city = response.css("article > p")[0].css("::text").extract()
    item = {
        'street': street,
        'city': city,
        'name': cleanjoin(response.css('article h1::text').extract(), ""),
        'phone': get_first_or_none(text.re("Telefon: ([0-9 /]+)")),
        'fax': get_first_or_none(text.re("Fax: ([0-9 /]+)")),
        'web': response.css("article a::attr(href)").extract_first(),
        'number': get_first_or_none(text.re("Schulnummer: ([0-9]+)")),
        'school_type': get_first_or_none(text.re("Schulart: (.+)")),
        'type': get_first_or_none(text.re("Rechtlicher Status: (.+)")),
        'teachers': get_first_or_none(text.re("Hauptamtliche Lehrkräfte: ([0-9]+)")),
        'students': get_first_or_none(text.re("Schüler: ([0-9]+)")),
        'url': response.url,
    }
    item['latitude'], item['longitude'] = self.get_lat_lon(response)
    yield item
start_urls = ['https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?SERVICE=WFS&VERSION=2.0.0&REQUEST=GetCapabilities']

def parse(self, response, **kwargs):
    """Parse the WFS GetCapabilities XML and request every feature type.

    One GetFeature request is issued per advertised FeatureType; responses
    are handled by :meth:`parse_resource`, which receives the feature name
    via ``cb_kwargs``.
    """
    tree = ET.fromstring(response.body)
    wfs = "{http://www.opengis.net/wfs/2.0}"
    base_url = 'https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?SERVICE=WFS&VERSION=2.0.0&REQUEST=GetFeature&srsname=EPSG:4326&typename='
    for feature_type in tree.iter(f"{wfs}FeatureType"):
        feature = feature_type.findtext(f"{wfs}Title")
        if feature is None:
            # findtext() returns None for a FeatureType without a <Title>;
            # interpolating it would request the literal typename "None".
            self.logger.warning("Skipping FeatureType without a Title")
            continue
        yield scrapy.Request(f"{base_url}{feature}",
                             callback=self.parse_resource,
                             cb_kwargs={"feature": feature})

def parse_resource(self, response, feature):
    """Parse one WFS GetFeature response, yielding a dict per school.

    Each child element of a school feature becomes a key (namespace
    stripped); the gml geometry is unpacked into 'lat'/'lon'.
    """
    gml_ns = "{http://www.opengis.net/gml/3.2}"
    schul_ns = "{http://gdi.bayern/brbschul}"
    namespaces = {
        "gml": "http://www.opengis.net/gml/3.2",
        "schul": "http://gdi.bayern/brbschul",
    }
    # Turn the prefixed feature name (e.g. "schul:...") into Clark notation.
    school_tag = feature.replace("schul:", schul_ns)
    for school in ET.fromstring(response.body).iter(school_tag):
        record = {'id': school.attrib[f"{gml_ns}id"]}
        for child in school:
            if child.tag == f"{schul_ns}geometry":
                pos = child.findtext("gml:Point/gml:pos", namespaces=namespaces)
                record["lat"], record["lon"] = pos.split(" ")
            else:
                # Strip the '{namespace}' prefix so keys are plain names.
                record[child.tag.split("}", 1)[1]] = child.text
        yield record

@staticmethod
def normalize(item: Item) -> School:
    """Map the raw WFS attribute names onto the shared School item.

    Keys ('schulname', 'strasse', 'ort', …) are the namespace-stripped
    element names produced by parse_resource; missing keys yield None.
    """
    # NOTE(review): the diff fusion left two return statements here; the
    # first used the pre-WFS keys ('city', 'number', …) that the new items
    # no longer carry (item.get('city') is None -> .split() would raise
    # AttributeError) and made the second return unreachable. Only the
    # post-merge body is kept.
    return School(name=item.get('schulname'),
                  address=item.get('strasse'),
                  city=item.get('ort'),
                  school_type=item.get('schulart'),
                  zip=item.get('postleitzahl'),
                  id='BY-{}'.format(item.get('id')),
                  latitude=item.get('lat'),
                  longitude=item.get('lon'))
Loading