Skip to content

Commit

Permalink
Fixed issues
Browse files Browse the repository at this point in the history
Signed-off-by: Bart Hanssens <bart.hanssens@bosa.fgov.be>
  • Loading branch information
barthanssens committed Jul 16, 2024
1 parent b78f251 commit 6f07f07
Show file tree
Hide file tree
Showing 7 changed files with 135 additions and 17 deletions.
4 changes: 2 additions & 2 deletions previewer/dependency-reduced-pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent>
<artifactId>dcattools</artifactId>
<groupId>be.fedict.dcat</groupId>
<groupId>be.gov.data</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
Expand All @@ -23,7 +23,7 @@
<configuration>
<transformers>
<transformer>
<mainClass>be.fedict.dcat.previewer.Main</mainClass>
<mainClass>be.gov.data.previewer.Main</mainClass>
</transformer>
<transformer />
</transformers>
Expand Down
8 changes: 4 additions & 4 deletions scrapers/dependency-reduced-pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent>
<artifactId>dcattools</artifactId>
<groupId>be.fedict.dcat</groupId>
<version>2.27.0</version>
<groupId>be.gov.data</groupId>
<version>3.0.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>scrapers</artifactId>
<name>scrapers</name>
<version>2.27.0</version>
<version>3.0.0</version>
<description>Various scrapers for Belgian data portals</description>
<url>https://data.gov.be</url>
<developers>
Expand Down Expand Up @@ -59,7 +59,7 @@
<addHeader>false</addHeader>
</transformer>
<transformer>
<mainClass>be.fedict.dcat.scrapers.Main</mainClass>
<mainClass>be.gov.data.scrapers.Main</mainClass>
</transformer>
<transformer />
</transformers>
Expand Down
6 changes: 3 additions & 3 deletions scrapers/nbactions-release-profile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
<goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
</goals>
<properties>
<exec.args>-classpath %classpath be.fedict.dcat.scrapers.Main</exec.args>
<exec.args>-classpath %classpath be.gov.data.scrapers.Main</exec.args>
<exec.executable>java</exec.executable>
</properties>
</action>
Expand All @@ -24,7 +24,7 @@
<goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
</goals>
<properties>
<exec.args>-Xdebug -Xrunjdwp:transport=dt_socket,server=n,address=${jpda.address} -classpath %classpath be.fedict.dcat.scrapers.Main</exec.args>
<exec.args>-Xdebug -Xrunjdwp:transport=dt_socket,server=n,address=${jpda.address} -classpath %classpath be.gov.data.scrapers.Main</exec.args>
<exec.executable>java</exec.executable>
<jpda.listen>true</jpda.listen>
</properties>
Expand All @@ -39,7 +39,7 @@
<goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
</goals>
<properties>
<exec.args>-classpath %classpath be.fedict.dcat.scrapers.Main</exec.args>
<exec.args>-classpath %classpath be.gov.data.scrapers.Main</exec.args>
<exec.executable>java</exec.executable>
</properties>
</action>
Expand Down
6 changes: 3 additions & 3 deletions scrapers/nbactions.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
<exec.args>${exec.vmArgs} -classpath %classpath ${exec.mainClass} ${exec.appArgs}</exec.args>
<exec.executable>java</exec.executable>
<exec.workingdir>c:\datagov</exec.workingdir>
<exec.appArgs>-n wallonie</exec.appArgs>
<exec.appArgs>-n vlaanderen</exec.appArgs>
<exec.vmArgs></exec.vmArgs>
<exec.mainClass>be.gov.data.scrapers.Main</exec.mainClass>
</properties>
Expand All @@ -32,7 +32,7 @@
<exec.executable>java</exec.executable>
<jpda.listen>true</jpda.listen>
<exec.workingdir>c:\datagov</exec.workingdir>
<exec.appArgs>-n wallonie</exec.appArgs>
<exec.appArgs>-n vlaanderen</exec.appArgs>
<exec.vmArgs>-agentlib:jdwp=transport=dt_socket,server=n,address=${jpda.address}</exec.vmArgs>
<exec.mainClass>be.gov.data.scrapers.Main</exec.mainClass>
</properties>
Expand All @@ -50,7 +50,7 @@
<exec.args>${exec.vmArgs} -classpath %classpath ${exec.mainClass} ${exec.appArgs}</exec.args>
<exec.executable>java</exec.executable>
<exec.workingdir>c:\datagov</exec.workingdir>
<exec.appArgs>-n wallonie</exec.appArgs>
<exec.appArgs>-n vlaanderen</exec.appArgs>
<exec.vmArgs></exec.vmArgs>
<exec.mainClass>be.gov.data.scrapers.Main</exec.mainClass>
</properties>
Expand Down
116 changes: 116 additions & 0 deletions scrapers/src/main/java/be/gov/data/scrapers/metawal/GeonetMetawal.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Copyright (c) 2024, FPS BOSA DG SD
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package be.gov.data.scrapers.metawal;

import be.gov.data.helpers.Storage;
import be.gov.data.scrapers.Cache;
import be.gov.data.scrapers.Dcat;
import be.gov.data.scrapers.Page;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Properties;
import java.util.Set;
import org.eclipse.rdf4j.repository.RepositoryException;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFParseException;

/**
* MetaWal via DCAT-AP catalog.
*
* @see <a href="https://metawal.wallonie.be/">MetaWal</a>
* @author Bart Hanssens
*/
public class GeonetMetawal extends Dcat {
    /** Key under which each catalog page is stored in, and read back from, the cache. */
    private static final String PAGE_KEY = "all";

    /** Number of records requested per catalog page (the {@code limit} query parameter). */
    private static final int PAGE_SIZE = 20;

    /** Attribute run that is stripped from the cached RDF/XML before it is parsed. */
    private static final String BUGGY_ATTRIBUTES =
            " xsi:type=\"xs:string\""
            + " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\""
            + " xmlns:xs=\"http://www.w3.org/2001/XMLSchema\"";

    /**
     * Load every cached catalog page into the store as RDF/XML and generate the catalog.
     *
     * A page whose parsing fails with a "Premature end" message is logged and skipped;
     * any other parse or I/O failure is rethrown as a {@link RepositoryException}
     * carrying the page URL as message and the original exception as cause.
     *
     * @param cache cache holding the previously scraped pages
     * @param store storage the RDF/XML content is added to
     * @throws RepositoryException when a page cannot be parsed or read
     * @throws MalformedURLException declared by the overridden method
     */
    @Override
    public void generateDcat(Cache cache, Storage store) throws RepositoryException, MalformedURLException {
        Set<URL> urls = cache.retrievePageList();
        for (URL url : urls) {
            Page page = cache.retrievePage(url).get(PAGE_KEY);
            String raw = (page == null) ? null : page.getContent();
            if (raw == null) {
                // a listed URL without usable content would otherwise fail with a bare NPE
                LOG.warn("No cached content for {}", url);
                continue;
            }
            // fix buggy input
            String content = raw.replace(BUGGY_ATTRIBUTES, "");
            try (InputStream in = new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8))) {
                store.add(in, RDFFormat.RDFXML);
            } catch (RDFParseException | IOException ex) {
                // getMessage() may be null, so guard before inspecting it
                String msg = ex.getMessage();
                if (msg != null && msg.contains("Premature end")) {
                    LOG.warn("Premature end of file in {}", url);
                } else {
                    throw new RepositoryException(url.toString(), ex);
                }
            }
        }
        generateCatalog(store);
    }

    /**
     * Scrape DCAT catalog.
     *
     * Requests consecutive pages of {@value #PAGE_SIZE} records from the base URL and
     * stores each response in the cache. Stops at the first response that mentions
     * neither "Dataset" nor "DataService"; that response is not stored.
     *
     * @param cache cache the downloaded pages are stored in
     * @throws IOException when a request fails or a page cannot be stored
     */
    @Override
    protected void scrapeCat(Cache cache) throws IOException {
        for (int start = 0; ; start += PAGE_SIZE) {
            URL url = new URL(getBase().toString() + "?startindex=" + start + "&limit=" + PAGE_SIZE + "&f=dcat");
            String xml = makeRequest(url);
            if (!xml.contains("Dataset") && !xml.contains("DataService")) {
                LOG.info("Last (empty) page");
                break;
            }
            cache.storePage(url, PAGE_KEY, new Page(url, xml));
        }
    }

    /**
     * Scrape the catalog, but only when the cache does not list any page yet.
     *
     * @throws IOException when scraping the catalog fails
     */
    @Override
    public void scrape() throws IOException {
        LOG.info("Start scraping");
        Cache cache = getCache();

        Set<URL> urls = cache.retrievePageList();
        if (urls.isEmpty()) {
            scrapeCat(cache);
        }
        LOG.info("Done scraping");
    }

    /**
     * Constructor.
     *
     * @param prop properties passed on to the parent scraper
     * @throws IOException declared by the parent constructor
     */
    public GeonetMetawal(Properties prop) throws IOException {
        super(prop);
        setName("metawal");
    }

}
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# Fix temporal resolution
# Move accessURL to downloadURL

PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

DELETE
{ ?s dcat:temporalResolution "P1H"^^xsd:duration }
{ ?s dcat:accessURL ?url }
INSERT
{ ?s dcat:temporalResolution "PT1H"^^xsd:duration }
{ ?s dcat:downloadURL ?url }
WHERE
{ ?s dcat:temporalResolution "P1H"^^xsd:duration }
{ ?s dcat:accessURL ?url .
FILTER CONTAINS(STR(?url), "download")
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ healthdata/sparql-theme.qry
healthdata/sparql-accessurl.qry
healthdata/sparql-downloadurl.qry
healthdata/sparql-fix-datetime.qry
healthdata/sparql-fix-datetime2.qry
sparql-date-type.qry
clear-skos.qry
sparql-strip-html.qry
Expand Down

0 comments on commit 6f07f07

Please sign in to comment.