Skip to content

Commit

Permalink
refactor: change spq_init(), spq_perform(), send_sparql() to move req…
Browse files Browse the repository at this point in the history
…uest control to spq_init()! (#176)
  • Loading branch information
maelle authored Sep 21, 2023
1 parent ec36af4 commit 659e055
Show file tree
Hide file tree
Showing 27 changed files with 795 additions and 588 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ export(spq)
export(spq_add)
export(spq_arrange)
export(spq_assemble)
export(spq_control_request)
export(spq_count)
export(spq_filter)
export(spq_group_by)
Expand Down
7 changes: 3 additions & 4 deletions R/build_sparql.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#' Assemble query parts into a proper SPARQL query
#' @param .query a list with elements of the query
#' @param endpoint SPARQL endpoint to send the query to
#' @param strict whether to perform some linting on the query,
#' and error in case a problem is detected.
#' @return A query object
Expand All @@ -12,9 +11,9 @@
#' spq_add("?city wdt:P1082 ?pop") %>%
#' spq_assemble() %>%
#' cat()
spq_assemble = function(.query,
endpoint = "Wikidata",
strict = TRUE) {
spq_assemble = function(.query, strict = TRUE) {

endpoint = .query[["endpoint"]]

.query = spq_prefix(.query, auto = TRUE, prefixes = NULL)

Expand Down
161 changes: 105 additions & 56 deletions R/send_sparql.R
Original file line number Diff line number Diff line change
@@ -1,79 +1,128 @@
#' Send SPARQL query to endpoint and get tibble as a result
#' @param .query a string corresponding to a SPARQL query
#' @param query_string a string corresponding to a SPARQL query
#' @param endpoint a string or url corresponding to a SPARQL endpoint. Defaults to "Wikidata"
#' @param user_agent a string indicating the user agent to send with the query.
#' @inheritParams httr2::req_retry
#' @param timeout maximum number of seconds to wait (`httr2::req_timeout()`).
#' @param request_type a string indicating how the query should be sent: in the
#' @param user_agent `r lifecycle::badge('deprecated')` a string indicating the user agent to send with the query.
#' @param max_tries,max_seconds `r lifecycle::badge('deprecated')` Cap the maximal number of
#' attempts with `max_tries` or the total elapsed time from the first request with `max_seconds`.
#' @param timeout `r lifecycle::badge('deprecated')` maximum number of seconds to wait (`httr2::req_timeout()`).
#' @param request_type `r lifecycle::badge('deprecated')` a string indicating how the query should be sent: in the
#' URL (`url`, default, most common) or as a body form (`body-form`).
#' @param dry_run Boolean indicating whether to return the output of `httr2::req_dry_run()`
#' rather than of `httr2::req_perform`. Useful for debugging.
#' rather than of `httr2::req_perform()`. Useful for debugging.
#' @inheritParams spq_init
#' @examples
#'metro_query='SELECT ?item ?itemLabel ?coords
#'{
#' ?item wdt:P361 wd:Q1552;
#' wdt:P625 ?coords.
#' OPTIONAL{?item wdt:P1619 ?date.}
#' SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
#'} ORDER BY ?itemLabel
#''
#'send_sparql(metro_query)
#' \dontrun{
#' query_string = spq_init() %>%
#' spq_add("?city wdt:P31/wdt:P279* wd:Q486972") %>%
#' spq_label(city) %>%
#' spq_mutate(coords = wdt::P625(city),
#' .within_distance=list(center=c(long=4.84,lat=45.76),
#' radius=5)) %>%
#' spq_assemble()
#' send_sparql(query_string)
#' }
#' @details
#'
#' Control the way the query is performed via the `request_control`
#' argument of `spq_init()`.
#' This way you can create a basic spq object with all the correct options
#' corresponding to the SPARQL service you are using, and then use it as
#' the basis of all your subsequent glitter pipelines.
#'
#'
#' @export
send_sparql = function(.query,
endpoint = "Wikidata",
user_agent = getOption("glitter.ua", "glitter R package (https://github.com/lvaudor/glitter)"),
max_tries = getOption("glitter.max_tries", 3L),
max_seconds = getOption("glitter.max_seconds", 120L),
timeout = getOption("glitter.timeout", 1000L),
request_type = c("url", "body-form"),
dry_run = FALSE) {

if (!inherits(user_agent, "character")) {
cli::cli_abort("{.field user_agent} must be a string.")
send_sparql = function(query_string,
endpoint = NULL,
user_agent = lifecycle::deprecated(),
max_tries = lifecycle::deprecated(),
max_seconds = lifecycle::deprecated(),
timeout = lifecycle::deprecated(),
request_type = lifecycle::deprecated(),
dry_run = FALSE,
request_control = NULL) {

if (lifecycle::is_present(user_agent)) {
lifecycle::deprecate_warn(
"0.3.0",
"spq_perform(user_agent)",
"spq_request_control(user_agent)",
details = control_explanation()
)
} else {
user_agent = request_control[["user_agent"]]
}

if (!inherits(max_tries, "integer")) {
cli::cli_abort("{.field max_tries} must be a integer")
if (lifecycle::is_present(max_tries)) {
lifecycle::deprecate_warn(
"0.3.0",
"spq_perform(max_tries)",
"spq_request_control(max_tries)",
details = control_explanation()
)
} else {
max_tries = request_control[["max_tries"]]
}

if (!inherits(max_seconds, "integer")) {
cli::cli_abort("{.field max_seconds} must be a integer")
if (lifecycle::is_present(max_seconds)) {
lifecycle::deprecate_warn(
"0.3.0",
"spq_perform(max_seconds)",
"spq_request_control(max_seconds)",
details = control_explanation()
)
} else {
max_seconds = request_control[["max_seconds"]]
}

if (!inherits(timeout, "integer")) {
cli::cli_abort("{.field timeout} must be a integer")
if (lifecycle::is_present(timeout)) {
lifecycle::deprecate_warn(
"0.3.0",
"spq_perform(timeout)",
"spq_request_control(timeout)",
details = control_explanation()
)
} else {
timeout = request_control[["timeout"]]
}

request_type <- rlang::arg_match(request_type, c("url", "body-form"))

endpoint = tolower(endpoint)
if (lifecycle::is_present(request_type)) {
lifecycle::deprecate_warn(
"0.3.0",
"spq_perform(request_type)",
"spq_request_control(request_type)",
details = control_explanation()
)
} else {
request_type = request_control[["request_type"]]
}

# if endpoint wikidata, use WikidataQueryServiceR::query_wikidata()
if (endpoint == "wikidata") {
return(purrr::quietly(WikidataQueryServiceR::query_wikidata)(.query)$result)
if (endpoint == "https://query.wikidata.org/") {
return(purrr::quietly(WikidataQueryServiceR::query_wikidata)(query_string)$result)
}
# else, use httr2

# if endpoint passed as name, get url
usual_endpoint_info = usual_endpoints %>%
dplyr::filter(.data$name == endpoint)
url = if (nrow(usual_endpoint_info) > 0) {
dplyr::pull(usual_endpoint_info, .data$url)
} else {
endpoint
}

initial_request = httr2::request(url) %>%
initial_request = httr2::request(endpoint) %>%
httr2::req_method("POST") %>%
httr2::req_headers(Accept = "application/sparql-results+json") %>%
httr2::req_user_agent(user_agent) %>%
httr2::req_retry(max_tries = max_tries, max_seconds = max_seconds) %>%
httr2::req_timeout(timeout)

request <- if (request_type == "url") {
httr2::req_url_query(initial_request, query = .query)
rate = request_control[["rate"]]
if (!is.null(rate)) {
realm = request_control[["realm"]]
initial_request = httr2::req_throttle(
initial_request,
rate = rate,
realm = realm
)
}

request = if (request_type == "url") {
httr2::req_url_query(initial_request, query = query_string)
} else {
httr2::req_body_form(initial_request, query = .query)
httr2::req_body_form(initial_request, query = query_string)
}

if (dry_run) {
Expand All @@ -82,13 +131,13 @@ send_sparql = function(.query,

resp <- httr2::req_perform(request)

httr2::resp_check_status(resp)
httr2::resp_check_status(resp)

if (httr2::resp_content_type(resp) != "application/sparql-results+json") {
rlang::abort("Not right response type") #TODO:better message, more flexibility
}
if (httr2::resp_content_type(resp) != "application/sparql-results+json") {
rlang::abort("Not right response type") #TODO:better message, more flexibility
}

content = httr2::resp_body_json(resp)
content = httr2::resp_body_json(resp)

# Adapted from https://github.com/wikimedia/WikidataQueryServiceR/blob/accff89a06ad4ac4af1bef369f589175c92837b6/R/query.R#L56
if (length(content$results$bindings) > 0) {
Expand All @@ -103,7 +152,7 @@ send_sparql = function(.query,
type,
character = x,
# easier for now as dbpedia can return different things with the same name
integer = ifelse(endpoint == "dbpedia", x, as.integer(x)),
integer = ifelse(endpoint == "https://dbpedia.org/sparql", x, as.integer(x)),
datetime = anytime::anytime(x),
x
)
Expand Down
74 changes: 74 additions & 0 deletions R/spq_control_request.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#' Create the request control object for `spq_init()`
#'
#' Validates the request settings and bundles them into a classed list.
#' `user_agent`, `max_tries`, `max_seconds` and `timeout` must each be a
#' single, non-missing value of the expected type; anything else is rejected
#' here rather than when the request is eventually built.
#'
#' @param user_agent a string indicating the user agent to send with the query.
#'   Defaults to the `glitter.ua` option.
#' @param max_tries,max_seconds Cap the maximal number of
#'   attempts with `max_tries` or the total elapsed time from the first request
#'   with `max_seconds`. Both must be integers (e.g. `3L`, not `3`) and default
#'   to the `glitter.max_tries` and `glitter.max_seconds` options.
#' @param timeout maximum number of seconds to wait (`httr2::req_timeout()`).
#'   Must be an integer; defaults to the `glitter.timeout` option.
#' @param request_type a string indicating how the query should be sent: in the
#'   URL (`url`, default, most common) or as a body form (`body-form`).
#' @inheritParams httr2::req_throttle
#'
#' @return A list of class `"glitter_request_control"`, to be used in
#'   `spq_init()`'s `request_control` argument.
#' @export
#'
#' @examples
#' # Defaults
#' spq_control_request()
#' # Tweaking values
#' spq_control_request(
#'   user_agent = "Jane Doe https://example.com",
#'   max_tries = 1L,
#'   max_seconds = 10L,
#'   timeout = 10L,
#'   request_type = "url"
#' )
spq_control_request <- function(user_agent = getOption("glitter.ua", "glitter R package (https://github.com/lvaudor/glitter)"),
                                max_tries = getOption("glitter.max_tries", 3L),
                                max_seconds = getOption("glitter.max_seconds", 120L),
                                timeout = getOption("glitter.timeout", 1000L),
                                request_type = c("url", "body-form"),
                                rate = NULL,
                                realm = NULL) {

  # TRUE unless `x` is exactly one non-missing value. The length test comes
  # first so `is.na()` is only ever evaluated on a length-one input.
  not_scalar <- function(x) {
    length(x) != 1L || is.na(x)
  }

  if (!is.character(user_agent)) {
    cli::cli_abort("Must provide a character as {.arg user_agent}.")
  }
  if (not_scalar(user_agent)) {
    cli::cli_abort("{.arg user_agent} must be a single, non-missing string.")
  }

  # The three numeric settings share the same rules, so check them in one
  # place. The type error keeps its original wording; the scalar check is new.
  integer_args <- list(
    max_tries = max_tries,
    max_seconds = max_seconds,
    timeout = timeout
  )
  for (arg in names(integer_args)) {
    value <- integer_args[[arg]]
    if (!is.integer(value)) {
      cli::cli_abort(c(
        "Must provide an integer as {.arg {arg}}.",
        i = "You provided a {.val {typeof(value)}}."
      ))
    }
    if (not_scalar(value)) {
      cli::cli_abort("{.arg {arg}} must be a single, non-missing integer.")
    }
  }

  # Collapses the default choice vector to its first element ("url") and
  # errors on any value outside the allowed set.
  request_type <- rlang::arg_match(request_type, c("url", "body-form"))

  # `rate` and `realm` are stored untouched.
  structure(
    list(
      user_agent = user_agent,
      max_tries = max_tries,
      max_seconds = max_seconds,
      timeout = timeout,
      request_type = request_type,
      rate = rate,
      realm = realm
    ),
    class = "glitter_request_control"
  )
}
34 changes: 32 additions & 2 deletions R/spq_init.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
#' Initialize a query object.
#'
#' @param endpoint SPARQL endpoint: either a name listed in `usual_endpoints`,
#' or a URL
#' @param request_control An object as returned by [`spq_control_request()`]
#'
#' @return A query object
#' @export
#' @section Printing:
Expand All @@ -15,7 +20,30 @@
#' You can also turn off the cli behavior by setting the environment variable
#' `"GLITTER.NOCLI"` to any non-empty string.
#' That's what we do in glitter snapshot tests.
spq_init = function(){
spq_init = function(
endpoint = "wikidata",
request_control = spq_control_request(
user_agent = getOption("glitter.ua", "glitter R package (https://github.com/lvaudor/glitter)"),
max_tries = getOption("glitter.max_tries", 3L),
max_seconds = getOption("glitter.max_seconds", 120L),
timeout = getOption("glitter.timeout", 1000L),
request_type = c("url", "body-form")
)
) {
if (!inherits(request_control, "glitter_request_control")) {
cli::cli_abort("{.arg request_control} must be created by {.fun spq_control_request}.")
}

# if endpoint passed as name, get url
endpoint = tolower(endpoint)
usual_endpoint_info = usual_endpoints %>%
dplyr::filter(.data$name == endpoint)
endpoint = if (nrow(usual_endpoint_info) > 0) {
dplyr::pull(usual_endpoint_info, .data$url)
} else {
endpoint
}

query = list(
prefixes_provided = tibble::tibble(name = NULL, url = NULL),
prefixes_used = NULL,
Expand All @@ -27,7 +55,9 @@ spq_init = function(){
limit = NULL,
group_by = NULL,
order_by = NULL,
offset = NULL
offset = NULL,
endpoint = endpoint,
request_control = request_control
)

structure(query, class = c("sparqle_query", "list"))
Expand Down
Loading

0 comments on commit 659e055

Please sign in to comment.