Skip to content

Commit

Permalink
Fix search and index to ensure proper functionality
Browse files Browse the repository at this point in the history
- Fix before_dataset_index to use real lists instead of stringified JSON lists, remove empty extras keys, and flatten repeating subfields.
- Fix facet search and custom facets
- Fix templates to include new helpers
- Update search to be compatible with CKAN >= 2.10
- Fix facet search operators.
  • Loading branch information
mjanez committed Sep 10, 2024
1 parent 4b5c4b7 commit c294495
Show file tree
Hide file tree
Showing 5 changed files with 270 additions and 156 deletions.
43 changes: 26 additions & 17 deletions ckanext/schemingdcat/faceted.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import logging
import json

import ckan.plugins as plugins
from ckan.common import request

import ckanext.schemingdcat.config as sdct_config
from ckanext.schemingdcat.utils import get_facets_dict
import logging
from ckanext.schemingdcat.helpers import schemingdcat_get_current_lang
import ckanext.schemingdcat.utils as utils
from ckanext.schemingdcat.utils import deprecated

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -32,18 +37,23 @@ def dataset_facets(self,
else:
return facets_dict

def _custom_facets(self,
facets_dict,
package_type):

lang_code = request.environ['CKAN_LANG']

def _custom_facets(self, facets_dict, package_type):
lang_code = schemingdcat_get_current_lang()

# Initialize cache dictionary if it does not exist
if not hasattr(sdct_config, 'dataset_custom_facets'):
sdct_config.dataset_custom_facets = {}

# Check if we already cached the results for the current language
if lang_code in sdct_config.dataset_custom_facets:
return sdct_config.dataset_custom_facets[lang_code]

_facets_dict = {}
for facet in self.facet_list:
# Look for the field label in the scheming file.
# If it's not there, use the default dictionary provided
scheming_item = get_facets_dict().get(facet)

scheming_item = utils.get_facets_dict().get(facet)
if scheming_item:
# Retrieve the corresponding label for the used language
_facets_dict[facet] = scheming_item.get(lang_code)
Expand All @@ -57,17 +67,16 @@ def _custom_facets(self,
else:
log.warning(
"Unable to find a valid label for the field '%s' when faceting" % facet)

if not _facets_dict[facet]:
_facets_dict[facet] = plugins.toolkit._(facet)

else:
_facets_dict[facet] = plugins.toolkit._(facets_dict.get(facet))

# tag_key = 'tags_' + lang_code
# facets_dict[tag_key] = plugins.toolkit._('Tag')
#FIXME: FOR COMMON TAG FACET
#log.debug("dataset_facets._facets_dict: {0}".format(_facets_dict))

# Cache the results for the current language
sdct_config.dataset_custom_facets[lang_code] = _facets_dict

return _facets_dict

def group_facets(self,
Expand Down
193 changes: 169 additions & 24 deletions ckanext/schemingdcat/package_controller.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
from ckan.common import request
import json
import ckan.plugins as plugins

from ckanext.scheming.plugins import (
SchemingDatasetsPlugin
)

import ckanext.schemingdcat.config as sdct_config
import ckanext.schemingdcat.utils as utils

import logging
import sys
import ast

FACET_OPERATOR_PARAM_NAME = '_facet_operator'
FACET_SORT_PARAM_NAME = '_%s_sort'
Expand All @@ -15,7 +20,7 @@

class PackageController():

plugins.implements(plugins.IPackageController)
plugins.implements(plugins.IPackageController, inherit=True)

default_facet_operator = sdct_config.default_facet_operator

Expand All @@ -37,22 +42,31 @@ def authz_remove_role(self, object_role):
def delete(self, entity):
pass


# CKAN < 2.10
def before_search(self, search_params):
"""Modifies search parameters before executing a search.
return self.before_dataset_search(search_params)

# CKAN >= 2.10
def before_dataset_search(self, search_params):
"""Modifies search parameters before executing a search.
This method adjusts the 'fq' (filter query) parameter based on the 'facet.field' value in the search parameters. If 'facet.field' is a list, it iterates through each field, applying the '_facet_search_operator' to modify 'fq'. If 'facet.field' is a string, it directly applies the '_facet_search_operator'. If 'facet.field' is not present or is invalid, no modification is made.
Args:
search_params (dict): The search parameters to be modified. Expected to contain 'facet.field' and 'fq'.
Returns:
dict: The modified search parameters.
Raises:
Exception: Captures and logs any exception that occurs during the modification of search parameters.
"""
try:
#log.debug("Initial search_params: %s", search_params)
facet_field = search_params.get('facet.field', '')
#log.debug("facet.field: %s", facet_field)

if not facet_field:
return search_params
elif isinstance(facet_field, list):
Expand All @@ -68,48 +82,179 @@ def before_search(self, search_params):
log.error("[before_search] Error: %s", e)
return search_params

# CKAN < 2.10
def after_search(self, search_results, search_params):
    """Legacy hook name (CKAN < 2.10); forwards to ``after_dataset_search``.

    Args:
        search_results: The search results passed in by the caller.
        search_params: The search parameters passed in by the caller.

    Returns:
        Whatever ``after_dataset_search`` returns for the same arguments.
    """
    return self.after_dataset_search(search_results, search_params)

def after_dataset_search(self, search_results, search_params):
    """Return ``search_results`` unchanged; ``search_params`` is not used.

    Args:
        search_results: The search results passed in by the caller.
        search_params: The search parameters passed in by the caller.

    Returns:
        The same ``search_results`` object that was passed in.
    """
    return search_results

# CKAN < 2.10
def before_index(self, data_dict):
"""Processes the data dictionary before indexing.
return self.before_dataset_index(data_dict)

def before_dataset_index(self, data_dict):
"""
Processes the data dictionary before dataset indexing.
Args:
data_dict (dict): The data dictionary to be processed.
Returns:
dict: The processed data dictionary.
"""
# Remove empty extras keys
data_dict = self.remove_empty_extras_keys(data_dict)

Iterates through each facet defined in the system's facets dictionary. For each facet present in the data dictionary, it attempts to parse its value as JSON. If the value is a valid JSON string, it replaces the original string value with the parsed JSON object. If the value cannot be parsed as JSON (e.g., because it's not a valid JSON string), it leaves the value unchanged. Facets present in the data dictionary but not containing any data are removed.
# Convert stringified lists to actual lists
data_dict = self.convert_stringified_lists(data_dict)

# Flatten repeating subfields
data_dict = self.flatten_repeating_subfields(data_dict)

log.debug('before_dataset_index data_dict: %s', data_dict)

return data_dict

def convert_stringified_lists(self, data_dict):
    """
    Converts stringified lists in the data dictionary to actual lists.

    Any string value that starts with '[' and ends with ']' is parsed and
    replaced, in place, by the resulting object. Parsing is attempted as JSON
    first (so lists containing ``true``/``false``/``null`` are handled) and
    falls back to a Python literal for values such as ``"['a', 'b']"``.
    Values that cannot be parsed either way are logged and left untouched.

    Keys that start with 'extras_' or 'res_', and the key
    'validated_data_dict', are excluded from this conversion.

    Args:
        data_dict (dict): The data dictionary to be processed.

    Returns:
        dict: The same data dictionary, with stringified lists converted.
    """
    excluded_prefixes = ('extras_', 'res_')

    # Iterate over a snapshot because values are reassigned on data_dict.
    for key, value in list(data_dict.items()):
        if key == 'validated_data_dict' or key.startswith(excluded_prefixes):
            continue
        if not (isinstance(value, str) and value.startswith('[') and value.endswith(']')):
            continue

        try:
            data_dict[key] = json.loads(value)
            continue
        except ValueError:
            # Not valid JSON (json.JSONDecodeError is a ValueError); try a
            # Python literal next.
            pass

        try:
            data_dict[key] = ast.literal_eval(value)
        except (ValueError, SyntaxError) as e:
            log.error("Error converting stringified list for key '%s': %s", key, e)

    return data_dict

def remove_empty_extras_keys(self, data_dict):
"""
Remove extra_* and res_extras_* keys that contain empty lists or lists of empty strings.
Args:
data_dict (dict): The data dictionary to be processed.
Returns:
dict: The processed data dictionary with JSON strings parsed into objects where applicable and empty facets removed.
dict: The processed data dictionary with empty extras keys removed.
"""
for facet, label in utils.get_facets_dict().items():
data = data_dict.get(facet)
#log.debug("[before_index] Data ({1}) in facet: {0}".format(data, facet))
if data:
if isinstance(data, str):
try:
data_dict[facet] = json.loads(data)
except json.decoder.JSONDecodeError:
data_dict[facet] = data
else:
if facet in data_dict:
del data_dict[facet]
keys_to_remove = []
for key, value in data_dict.items():
if (key.startswith('extras_') or key.startswith('res_extras_')) and isinstance(value, list):
if all(not item.strip() for item in value if isinstance(item, str)):
keys_to_remove.append(key)

log.debug('keys_to_remove: %s', keys_to_remove)
for key in keys_to_remove:
data_dict.pop(key, None)

return data_dict

def flatten_repeating_subfields(self, data_dict):
    """
    Flatten repeating subfields into ``extras_{field_name}__{key}`` entries.

    Based on https://github.com/ckan/ckanext-scheming/pull/414

    Notes:
        Index suitable repeating dataset fields in before_dataset_index to
        prevent failures on unmodified solr schema. This will allow hitting
        results in most text and list subfields. Ideally you probably want to
        select the relevant subfields that will get indexed and modify the
        Solr schema if necessary.

        This implementation will group the values of the same subfields into
        an `extras_{field_name}__{key}`, a text Solr field that will allow
        free-text search on its value. Again, if you require more precise
        handling of a particular subfield, you will need to customize the
        Solr schema to add particular fields needed.

        Values are joined with single spaces. Non-string values are converted
        with ``str()`` so that mixed-type subfields do not raise; ``None``
        values, nested dicts and non-dict items are skipped.

    Args:
        data_dict (dict): The data dictionary to be processed.

    Returns:
        dict: The same data dictionary with repeating subfields flattened and
        the original repeating field removed. It is returned unchanged when
        its 'type' is missing or has no expanded schema.
    """
    schemas = SchemingDatasetsPlugin.instance._expanded_schemas
    dataset_type = data_dict.get('type')
    if dataset_type not in schemas:
        return data_dict

    schema = schemas[dataset_type]

    for field in schema['dataset_fields']:
        field_name = field['field_name']
        if field_name not in data_dict or 'repeating_subfields' not in field:
            continue

        # new_key -> list of text fragments, joined once at the end.
        collected = {}
        for item in data_dict[field_name] or []:
            if not isinstance(item, dict):
                continue
            for key, value in item.items():
                if value is None or isinstance(value, dict):
                    continue
                if isinstance(value, list):
                    text = ' '.join(str(part) for part in value)
                else:
                    text = str(value)
                new_key = 'extras_{field_name}__{key}'.format(
                    field_name=field_name, key=key
                )
                collected.setdefault(new_key, []).append(text)

        data_dict.update(
            {new_key: ' '.join(parts) for new_key, parts in collected.items()}
        )
        data_dict.pop(field_name, None)

    return data_dict

# CKAN < 2.10
def before_view(self, pkg_dict):
    """Legacy hook name (CKAN < 2.10); forwards to ``before_dataset_view``.

    Args:
        pkg_dict: The package dictionary passed in by the caller.

    Returns:
        Whatever ``before_dataset_view`` returns for ``pkg_dict``.
    """
    return self.before_dataset_view(pkg_dict)

def before_dataset_view(self, pkg_dict):
    """Return ``pkg_dict`` unchanged.

    Args:
        pkg_dict: The package dictionary passed in by the caller.

    Returns:
        The same ``pkg_dict`` object that was passed in.
    """
    return pkg_dict

# CKAN < 2.10
def after_create(self, context, data_dict):
    """Legacy hook name (CKAN < 2.10); forwards to ``after_dataset_create``.

    Args:
        context: The context passed in by the caller.
        data_dict: The data dictionary passed in by the caller.

    Returns:
        Whatever ``after_dataset_create`` returns for the same arguments.
    """
    return self.after_dataset_create(context, data_dict)

def after_dataset_create(self, context, data_dict):
    """Return ``data_dict`` unchanged; ``context`` is not used.

    Args:
        context: The context passed in by the caller.
        data_dict: The data dictionary passed in by the caller.

    Returns:
        The same ``data_dict`` object that was passed in.
    """
    return data_dict

# CKAN < 2.10
def after_update(self, context, data_dict):
    """Legacy hook name (CKAN < 2.10); forwards to ``after_dataset_update``.

    Args:
        context: The context passed in by the caller.
        data_dict: The data dictionary passed in by the caller.

    Returns:
        Whatever ``after_dataset_update`` returns for the same arguments.
    """
    return self.after_dataset_update(context, data_dict)

def after_dataset_update(self, context, data_dict):
    """Return ``data_dict`` unchanged; ``context`` is not used.

    Args:
        context: The context passed in by the caller.
        data_dict: The data dictionary passed in by the caller.

    Returns:
        The same ``data_dict`` object that was passed in.
    """
    return data_dict

# CKAN < 2.10
def after_delete(self, context, data_dict):
    """Legacy hook name (CKAN < 2.10); forwards to ``after_dataset_delete``.

    Args:
        context: The context passed in by the caller.
        data_dict: The data dictionary passed in by the caller.

    Returns:
        Whatever ``after_dataset_delete`` returns for the same arguments.
    """
    return self.after_dataset_delete(context, data_dict)

def after_dataset_delete(self, context, data_dict):
    """Return ``data_dict`` unchanged; ``context`` is not used.

    Args:
        context: The context passed in by the caller.
        data_dict: The data dictionary passed in by the caller.

    Returns:
        The same ``data_dict`` object that was passed in.
    """
    return data_dict

# CKAN < 2.10
def after_show(self, context, data_dict):
    """Legacy hook name (CKAN < 2.10); forwards to ``after_dataset_show``.

    Args:
        context: The context passed in by the caller.
        data_dict: The data dictionary passed in by the caller.

    Returns:
        Whatever ``after_dataset_show`` returns for the same arguments.
    """
    return self.after_dataset_show(context, data_dict)

def after_dataset_show(self, context, data_dict):
    """Return ``data_dict`` unchanged; ``context`` is not used.

    Args:
        context: The context passed in by the caller.
        data_dict: The data dictionary passed in by the caller.

    Returns:
        The same ``data_dict`` object that was passed in.
    """
    return data_dict

def update_facet_titles(self, facet_titles):
Expand All @@ -132,9 +277,9 @@ def _facet_search_operator(self, fq, facet_field):
try:
facet_operator = self.default_facet_operator
# Determine the facet operator based on request parameters
if request.params.get(FACET_OPERATOR_PARAM_NAME) == 'OR':
if request.args.get(FACET_OPERATOR_PARAM_NAME) == 'OR':
facet_operator = 'OR'
elif request.params.get(FACET_OPERATOR_PARAM_NAME) == 'AND':
elif request.args.get(FACET_OPERATOR_PARAM_NAME) == 'AND':
facet_operator = 'AND'

if facet_operator == 'OR' and facet_field:
Expand Down
1 change: 1 addition & 0 deletions ckanext/schemingdcat/templates/home/snippets/search.html
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

<div class="module module-search module-narrow module-shallow box">
<form class="module-content search-form" method="get" action="{% url_for 'dataset.search' %}">
{{ h.csrf_input() if 'csrf_input' in h }}
<h3 class="heading-layout2">{{ heading_title }}</h3>
<div class="search-input form-group search-giant">
<input aria-label="{% block header_site_search_label %}{{ _('Search datasets') }}{% endblock %}" id="field-main-search" type="text" class="form-control" name="q" value="" autocomplete="off" placeholder="{% block search_placeholder %}{{ placeholder }}{% endblock %}" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ <h2 class="module-heading search_heading">
</h2>
{% endblock %}
{% block facet_operator_items %}
{% set facet_operator = request.params.get('_facet_operator', h.schemingdcat_default_facet_search_operator()) %}
{% set facet_operator = request.args.get('_facet_operator', h.schemingdcat_default_facet_search_operator()) %}
<nav>
<ul class="unstyled nav nav-simple nav-facet">
<li class="nav-item container__filter__type__search">
Expand Down
Loading

0 comments on commit c294495

Please sign in to comment.