Fix search and index to ensure proper functionality

- Fix before_dataset_index to use real lists instead of stringified JSON lists, remove empty extras keys and fix repeating subfields. - Fix facet search and custom facets - Fix templates to include new helpers - Update search to be compatible with CKAN >= 2.10 - Fix facet search operators.
mjanez · Sep 10, 2024 · c294495 · c294495
1 parent 4b5c4b7
commit c294495
Show file tree

Hide file tree

Showing 5 changed files with 270 additions and 156 deletions.
diff --git a/ckanext/schemingdcat/faceted.py b/ckanext/schemingdcat/faceted.py
@@ -1,8 +1,13 @@
+import logging
+import json
+
 import ckan.plugins as plugins
 from ckan.common import request
+
 import ckanext.schemingdcat.config as sdct_config
-from ckanext.schemingdcat.utils import get_facets_dict
-import logging
+from ckanext.schemingdcat.helpers import schemingdcat_get_current_lang
+import ckanext.schemingdcat.utils as utils
+from ckanext.schemingdcat.utils import deprecated
 
 log = logging.getLogger(__name__)
 
@@ -32,18 +37,23 @@ def dataset_facets(self,
         else:
             return facets_dict
 
-    def _custom_facets(self,
-                       facets_dict,
-                       package_type):
-
-        lang_code = request.environ['CKAN_LANG']
-
+    def _custom_facets(self, facets_dict, package_type):
+        lang_code = schemingdcat_get_current_lang()
+
+        # Initialize cache dictionary if it does not exist
+        if not hasattr(sdct_config, 'dataset_custom_facets'):
+            sdct_config.dataset_custom_facets = {}
+
+        # Check if we already cached the results for the current language
+        if lang_code in sdct_config.dataset_custom_facets:
+            return sdct_config.dataset_custom_facets[lang_code]
+
         _facets_dict = {}
         for facet in self.facet_list:
             # Look for the field label in the scheming file.
             # If it's not there, use the default dictionary provided
-            scheming_item = get_facets_dict().get(facet)
-
+            scheming_item = utils.get_facets_dict().get(facet)
+    
             if scheming_item:
                 # Retrieve the corresponding label for the used language
                 _facets_dict[facet] = scheming_item.get(lang_code)
@@ -57,17 +67,16 @@ def _custom_facets(self,
                     else:
                         log.warning(
                             "Unable to find a valid label for the field '%s' when faceting" % facet)
-
+    
                 if not _facets_dict[facet]:
                     _facets_dict[facet] = plugins.toolkit._(facet)
-
+    
             else:
                 _facets_dict[facet] = plugins.toolkit._(facets_dict.get(facet))
-
-        # tag_key = 'tags_' + lang_code
-        # facets_dict[tag_key] = plugins.toolkit._('Tag')
-        #FIXME: FOR COMMON TAG FACET
-        #log.debug("dataset_facets._facets_dict: {0}".format(_facets_dict))
+
+        # Cache the results for the current language
+        sdct_config.dataset_custom_facets[lang_code] = _facets_dict
+
         return _facets_dict
 
     def group_facets(self,

diff --git a/ckanext/schemingdcat/package_controller.py b/ckanext/schemingdcat/package_controller.py
@@ -1,11 +1,16 @@
 from ckan.common import request
 import json
 import ckan.plugins as plugins
+
+from ckanext.scheming.plugins import (
+    SchemingDatasetsPlugin
+)
+
 import ckanext.schemingdcat.config as sdct_config
-import ckanext.schemingdcat.utils as utils
 
 import logging
 import sys
+import ast
 
 FACET_OPERATOR_PARAM_NAME = '_facet_operator'
 FACET_SORT_PARAM_NAME = '_%s_sort'
@@ -15,7 +20,7 @@
 
 class PackageController():
 
-    plugins.implements(plugins.IPackageController)
+    plugins.implements(plugins.IPackageController, inherit=True)
 
     default_facet_operator = sdct_config.default_facet_operator
 
@@ -37,22 +42,31 @@ def authz_remove_role(self, object_role):
     def delete(self, entity):
         pass
 
+
+    # CKAN < 2.10
     def before_search(self, search_params):
-        """Modifies search parameters before executing a search.
+        return self.before_dataset_search(search_params)
 
+    # CKAN >= 2.10
+    def before_dataset_search(self, search_params):
+        """Modifies search parameters before executing a search.
+    
         This method adjusts the 'fq' (filter query) parameter based on the 'facet.field' value in the search parameters. If 'facet.field' is a list, it iterates through each field, applying the '_facet_search_operator' to modify 'fq'. If 'facet.field' is a string, it directly applies the '_facet_search_operator'. If 'facet.field' is not present or is invalid, no modification is made.
-
+    
         Args:
             search_params (dict): The search parameters to be modified. Expected to contain 'facet.field' and 'fq'.
-
+    
         Returns:
             dict: The modified search parameters.
-
+    
         Raises:
             Exception: Captures and logs any exception that occurs during the modification of search parameters.
         """
         try:
+            #log.debug("Initial search_params: %s", search_params)
             facet_field = search_params.get('facet.field', '')
+            #log.debug("facet.field: %s", facet_field)
+
             if not facet_field:
                 return search_params
             elif isinstance(facet_field, list):
@@ -68,48 +82,179 @@ def before_search(self, search_params):
             log.error("[before_search] Error: %s", e)
         return search_params
 
+    # CKAN < 2.10
     def after_search(self, search_results, search_params):
+        return self.after_dataset_search(search_results, search_params)
+
+    def after_dataset_search(self, search_results, search_params):
         return search_results
 
+    # CKAN < 2.10
     def before_index(self, data_dict):
-        """Processes the data dictionary before indexing.
+        return self.before_dataset_index(data_dict)
+
+    def before_dataset_index(self, data_dict):
+        """
+        Processes the data dictionary before dataset indexing.
+    
+        Args:
+            data_dict (dict): The data dictionary to be processed.
+    
+        Returns:
+            dict: The processed data dictionary.
+        """
+        # Remove empty extras keys
+        data_dict = self.remove_empty_extras_keys(data_dict)
 
-        Iterates through each facet defined in the system's facets dictionary. For each facet present in the data dictionary, it attempts to parse its value as JSON. If the value is a valid JSON string, it replaces the original string value with the parsed JSON object. If the value cannot be parsed as JSON (e.g., because it's not a valid JSON string), it leaves the value unchanged. Facets present in the data dictionary but not containing any data are removed.
+        # Convert stringified lists to actual lists
+        data_dict = self.convert_stringified_lists(data_dict)
 
+        # Flatten repeating subfields
+        data_dict = self.flatten_repeating_subfields(data_dict)
+
+        log.debug('before_dataset_index data_dict: %s', data_dict)
+
+        return data_dict
+
+    def convert_stringified_lists(self, data_dict):
+        """
+        Converts stringified lists in the data dictionary to actual lists.
+    
         Args:
-            data_dict (dict): The data dictionary to be processed. It's expected to contain keys corresponding to facet names with their associated data as values.
+            data_dict (dict): The data dictionary to be processed.
+    
+        Returns:
+            dict: The processed data dictionary with actual lists.
+    
+        This function iterates over the items in the data dictionary and converts
+        any stringified lists (strings that start with '[' and end with ']') into
+        actual lists. Keys that start with 'extras_', 'res_', or are 'validated_data_dict'
+        are excluded from this conversion.
+        """
+        # Excluded items
+        excluded_keys = [
+            key for key in data_dict 
+            if key.startswith('extras_') or key.startswith('res_') or key == 'validated_data_dict'
+        ]
+
+        # Filter data dictionary
+        filter_data_dict = {
+            key: value for key, value in data_dict.items()
+            if key not in excluded_keys
+        }
+
+        for key, value in filter_data_dict.items():
+            if isinstance(value, str) and value.startswith('[') and value.endswith(']'):
+                try:
+                    data_dict[key] = ast.literal_eval(value)
+                except (ValueError, SyntaxError) as e:
+                    log.error("Error converting stringified list for key '%s': %s", key, e)
+
+        return data_dict
+
+    def remove_empty_extras_keys(self, data_dict):
+        """
+        Remove extra_* and res_extras_* keys that contain empty lists or lists of empty strings.
+
+        Args:
+            data_dict (dict): The data dictionary to be processed.
 
         Returns:
-            dict: The processed data dictionary with JSON strings parsed into objects where applicable and empty facets removed.
+            dict: The processed data dictionary with empty extras keys removed.
         """
-        for facet, label in utils.get_facets_dict().items():
-            data = data_dict.get(facet)
-            #log.debug("[before_index] Data ({1}) in facet: {0}".format(data, facet))
-            if data:
-                if isinstance(data, str):
-                    try:
-                        data_dict[facet] = json.loads(data)
-                    except json.decoder.JSONDecodeError:
-                        data_dict[facet] = data
-            else:
-                if facet in data_dict:
-                    del data_dict[facet]
+        keys_to_remove = []
+        for key, value in data_dict.items():
+            if (key.startswith('extras_') or key.startswith('res_extras_')) and isinstance(value, list):
+                if all(not item.strip() for item in value if isinstance(item, str)):
+                    keys_to_remove.append(key)
+
+        log.debug('keys_to_remove: %s', keys_to_remove)
+        for key in keys_to_remove:
+            data_dict.pop(key, None)
 
         return data_dict
+
+    def flatten_repeating_subfields(self, data_dict):
+        """
+        Based on https://github.com/ckan/ckanext-scheming/pull/414
+        
+        Notes:
+            Index suitable repeating dataset fields in before_dataset_index to prevent failures
+            on unmodified solr schema. This will allow hitting results in most text and list
+            subfields. Ideally you probably want to select the relevant subfields that will get
+            indexed and modify the Solr schema if necessary.
+            This implementation will group the values of the same subfields into an
+            `extras_{field_name}__{key}`,a text Solr field that will allow free-text search on
+            its value. Again, if you require more precise handling of a particular subfield,
+            you will need to customize the Solr schema to add particular fields needed.
+    
+        Args:
+            data_dict (dict): The data dictionary to be processed.
+    
+        Returns:
+            dict: The processed data dictionary with flattened repeating subfields.
+        """
+        schemas = SchemingDatasetsPlugin.instance._expanded_schemas
+        if data_dict['type'] not in schemas:
+            return data_dict
+
+        schema = schemas[data_dict['type']]
+
+        for field in schema['dataset_fields']:
+            if field['field_name'] in data_dict and 'repeating_subfields' in field:
+                flattened_values = {}
+                for item in data_dict[field['field_name']]:
+                    for key, value in item.items():
+                        if isinstance(value, dict):
+                            continue
+                        if isinstance(value, list):
+                            value = ' '.join(value)
+                        new_key = 'extras_{field_name}__{key}'.format(
+                            field_name=field["field_name"], key=key
+                        )
+                        if new_key not in flattened_values:
+                            flattened_values[new_key] = value
+                        else:
+                            flattened_values[new_key] += ' ' + value
+
+                data_dict.update(flattened_values)
+                data_dict.pop(field['field_name'], None)
+
+        return data_dict
 
+    # CKAN < 2.10
     def before_view(self, pkg_dict):
+        return self.before_dataset_view(pkg_dict)
+
+    def before_dataset_view(self, pkg_dict):
         return pkg_dict
 
+    # CKAN < 2.10
     def after_create(self, context, data_dict):
+        return self.after_dataset_create(context, data_dict)
+
+    def after_dataset_create(self, context, data_dict):
         return data_dict
 
+    # CKAN < 2.10
     def after_update(self, context, data_dict):
+        return self.after_dataset_update(context, data_dict)
+
+    def after_dataset_update(self, context, data_dict):
         return data_dict
 
+    # CKAN < 2.10
     def after_delete(self, context, data_dict):
+        return self.after_dataset_delete(context, data_dict)
+
+    def after_dataset_delete(self, context, data_dict):
         return data_dict
 
+    # CKAN < 2.10
     def after_show(self, context, data_dict):
+        return self.after_dataset_show(context, data_dict)
+
+    def after_dataset_show(self, context, data_dict):
         return data_dict
 
     def update_facet_titles(self, facet_titles):
@@ -132,9 +277,9 @@ def _facet_search_operator(self, fq, facet_field):
         try:
             facet_operator = self.default_facet_operator
             # Determine the facet operator based on request parameters
-            if request.params.get(FACET_OPERATOR_PARAM_NAME) == 'OR':
+            if request.args.get(FACET_OPERATOR_PARAM_NAME) == 'OR':
                 facet_operator = 'OR'
-            elif request.params.get(FACET_OPERATOR_PARAM_NAME) == 'AND':
+            elif request.args.get(FACET_OPERATOR_PARAM_NAME) == 'AND':
                 facet_operator = 'AND'
 
             if facet_operator == 'OR' and facet_field:

diff --git a/ckanext/schemingdcat/templates/home/snippets/search.html b/ckanext/schemingdcat/templates/home/snippets/search.html
@@ -7,6 +7,7 @@
 
 <div class="module module-search module-narrow module-shallow box">
   <form class="module-content search-form" method="get" action="{% url_for 'dataset.search' %}">
+    {{ h.csrf_input() if 'csrf_input' in h }}
     <h3 class="heading-layout2">{{ heading_title }}</h3>
     <div class="search-input form-group search-giant">
       <input aria-label="{% block header_site_search_label %}{{ _('Search datasets') }}{% endblock %}" id="field-main-search" type="text" class="form-control" name="q" value="" autocomplete="off" placeholder="{% block search_placeholder %}{{ placeholder }}{% endblock %}" />

diff --git a/ckanext/schemingdcat/templates/schemingdcat/snippets/facet_search_operator.html b/ckanext/schemingdcat/templates/schemingdcat/snippets/facet_search_operator.html
@@ -44,7 +44,7 @@ <h2 class="module-heading search_heading">
         </h2>
         {% endblock %}
         {% block facet_operator_items %}
-        {% set facet_operator = request.params.get('_facet_operator',  h.schemingdcat_default_facet_search_operator()) %}
+        {% set facet_operator = request.args.get('_facet_operator',  h.schemingdcat_default_facet_search_operator()) %}
         <nav>
           <ul class="unstyled nav nav-simple nav-facet">
           <li class="nav-item container__filter__type__search">