Skip to content

Commit

Permalink
Merge pull request #397 from GSA/feature/solrcloud
Browse files Browse the repository at this point in the history
Solrcloud Integration with Solr 8
  • Loading branch information
jbrown-xentity authored Jan 5, 2022
2 parents 5a7bf6c + 5a0da08 commit e1a38a9
Show file tree
Hide file tree
Showing 21 changed files with 1,535 additions and 776 deletions.
6 changes: 5 additions & 1 deletion .env
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ TEST_CKAN_DATASTORE_READ_URL=postgresql://datastore_ro:datastore@db/datastore_te
CKAN_SOLR_URL=http://solr:8983/solr/ckan
CKAN_REDIS_URL=redis://redis:6379/1

CKAN_SOLR_BASE_URL=http://solr:8983
CKAN_SOLR_USER=admin
CKAN_SOLR_PASSWORD=pass

TEST_CKAN_SOLR_URL=http://solr:8983/solr/ckan
TEST_CKAN_REDIS_URL=redis://redis:6379/1

Expand Down Expand Up @@ -142,4 +146,4 @@ CKANEXT__SAML2AUTH__REQUESTED_AUTHN_CONTEXT=http://idmanagement.gov/ns/assurance
CKANEXT__SAML2AUTH__REQUESTED_AUTHN_CONTEXT_COMPARISON=exact

# Avoid double package_show call to add tracking info
CKANEXT__DATAGOVCATALOG__ADD_PACKAGES_TRACKING_INFO=false
CKANEXT__DATAGOVCATALOG__ADD_PACKAGES_TRACKING_INFO=false
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ e2e/cypress/results/output.xml
node_modules
package-lock.json

solr/*.zip
ckan/setup/solr/*.zip
ckan/setup/solr/managed-schema
2 changes: 1 addition & 1 deletion .profile
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ export CKANEXT__SAML2AUTH__SYSADMINS_LIST=$(echo $VCAP_SERVICES | jq --raw-outpu
# Set up the collection in Solr
echo Setting up Solr collection
export SOLR_COLLECTION=ckan
./solr/migrate-solrcloud-schema.sh $SOLR_COLLECTION
./ckan/setup/migrate-solrcloud-schema.sh $SOLR_COLLECTION
export CKAN_SOLR_URL=$CKAN_SOLR_BASE_URL/solr/$SOLR_COLLECTION

# Write out any files and directories
Expand Down
7 changes: 5 additions & 2 deletions ckan/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ FROM openknowledge/ckan-dev:2.9
ENV GIT_BRANCH=2.9

# add dependencies for cryptography and vim
RUN apk add libressl-dev musl-dev libffi-dev xmlsec vim xmlsec-dev openjdk11
RUN apk add libressl-dev musl-dev libffi-dev xmlsec vim xmlsec-dev openjdk11 zip
# Download Saxon jar for FGDC2ISO transform (geodatagov)
ARG saxon_ver=9.9.1-7
ADD \
Expand All @@ -26,6 +26,9 @@ COPY docker-entrypoint.d/* /docker-entrypoint.d/
COPY setup/gunicorn.conf.py ${APP_DIR}/
COPY setup/server_start.sh ${APP_DIR}/

# Custom prerun script for Solr 8
COPY setup/GSA_prerun.py ${APP_DIR}/

COPY saml2 ${APP_DIR}/saml2

# COPY the ini test file to the container
Expand All @@ -39,4 +42,4 @@ RUN ln -s /usr/bin/python3 /usr/bin/python
# harvests, we need to setup a cron for the run command
COPY setup/harvest-check-cron /etc/crontabs/root

# RUN sudo -u ckan -EH pip3 install git+https://github.com/nickumia-reisys/werkzeug@e1f6527604ab30e4b46b5430a5fb97e7a7055cd7#egg=werkzeug
# RUN sudo -u ckan -EH pip3 install git+https://github.com/nickumia-reisys/werkzeug@e1f6527604ab30e4b46b5430a5fb97e7a7055cd7#egg=werkzeug
2 changes: 1 addition & 1 deletion ckan/requirements.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

# CKAN requirements and extensions
ckan @ git+https://github.com/ckan/ckan.git@ckan-2.9.4
ckan @ git+https://github.com/ckan/ckan.git@dev-v2.9
-e git+https://github.com/GSA/ckanext-datagovcatalog.git@main#egg=ckanext-datagovcatalog
-e git+https://github.com/GSA/ckanext-datagovtheme.git@main#egg=ckanext-datagovtheme
-e git+https://github.com/GSA/ckanext-datajson.git@main#egg=ckanext-datajson
Expand Down
24 changes: 12 additions & 12 deletions ckan/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,20 @@ boto==2.49.0
certifi==2021.10.8
cffi==1.15.0
chardet==3.0.4
ckan @ git+https://github.com/ckan/ckan.git@6731c5a821a6a5f4bdaa20f4e793e0b6ba44f823
ckan @ git+https://github.com/ckan/ckan.git@ef1432fa4177edbd0a2f457bad363a33b6b61344
-e git+https://github.com/GSA/ckanext-datagovcatalog.git@64e65702ae1eb5e46d9f37139dc4044c0f253526#egg=ckanext_datagovcatalog
-e git+https://github.com/GSA/ckanext-datagovtheme.git@fa53b7c836d4cc3f34b0c5f156b04f9bacb8a67e#egg=ckanext_datagovtheme
-e git+https://github.com/GSA/ckanext-datagovtheme.git@d20b0f696cc16251dc94ee811f3f36cbde6673dc#egg=ckanext_datagovtheme
-e git+https://github.com/GSA/ckanext-datajson.git@a5d8c9458a7efe955c31b90eeac1e7797881d014#egg=ckanext_datajson
ckanext-dcat @ git+https://github.com/ckan/ckanext-dcat@2d2c8a894bea8c97b0c8544465094f9979ac516b
ckanext-envvars @ git+https://github.com/GSA/ckanext-envvars.git@33f7e190ab332244cb961a425e09af592d9b647b
-e git+https://github.com/GSA/ckanext-geodatagov.git@af6378074fcbc2705e7e33960d5ddd2c8e46ed4c#egg=ckanext_geodatagov
ckanext-googleanalyticsbasic @ git+https://github.com/GSA/ckanext-googleanalyticsbasic.git@c6a425d5e14d658c0fa3661fdc4423162161c3f4
-e git+https://github.com/ckan/ckanext-harvest.git@9d5679f0461f5aac05b7f800e11b6a62afb7feeb#egg=ckanext_harvest
-e git+https://github.com/ckan/ckanext-harvest.git@d84d847b09f28ab97bf1ca0baa651fdc05693d03#egg=ckanext_harvest
ckanext-saml2auth @ git+https://github.com/keitaroinc/ckanext-saml2auth.git@7412ff7aba3d215f95a08f99216410e72e60c5bc
-e git+https://github.com/gsa/ckanext-spatial.git@3828c6e7efe7c4b5cef02f4e7163339c7b5c5710#egg=ckanext_spatial
ckantoolkit==0.0.3
click==7.1.2
cryptography==35.0.0
cryptography==36.0.1
defusedxml==0.7.1
Deprecated==1.2.13
distro==1.6.0
Expand All @@ -33,28 +33,28 @@ flask-multistatic==1.0
future==0.18.2
GeoAlchemy2==0.5.0
geomet==0.3.0
gevent==21.8.0
gevent==21.12.0
google-compute-engine==2.8.13
greenlet==1.1.2
gunicorn==20.1.0
html5lib==1.1
idna==2.10
importlib-resources==5.4.0
isodate==0.6.0
isodate==0.6.1
itsdangerous==2.0.1
Jinja2==2.11.3
json-table-schema==0.2.1
jsonschema==2.4.0
LEPL==5.1.3
lxml==4.6.4
Mako==1.1.5
lxml==4.7.1
Mako==1.1.6
Markdown==3.1.1
MarkupSafe==2.0.1
messytables==0.15.2
newrelic==7.2.4.171
nose==1.3.7
OWSLib==0.18.0
packaging==21.2
packaging==21.3
passlib==1.7.3
PasteDeploy==2.0.1
pathtools==0.1.2
Expand All @@ -65,7 +65,7 @@ psycopg2==2.8.6
pycparser==2.21
PyJWT==1.7.1
pyOpenSSL==20.0.1
pyparsing==2.4.7
pyparsing==3.0.6
pyproj==2.6.1
pysaml2==7.0.1
pysolr==3.6.0
Expand All @@ -78,7 +78,7 @@ PyUtilib==5.7.1
PyYAML==5.4
PyZ3950 @ git+https://github.com/danizen/PyZ3950@6d44a4ab85c8bda3a7542c2c9efdfad46c830219
rdflib==4.2.2
redis==4.0.0
redis==4.0.2
repoze.lru==0.7
repoze.who==2.3
requests==2.25.0
Expand All @@ -100,7 +100,7 @@ WebOb==1.8.7
Werkzeug==1.0.0
wrapt==1.13.3
xlrd==2.0.1
xmlschema==1.8.2
xmlschema==1.9.1
zipp==3.6.0
zope.event==4.5.0
zope.interface==5.4.0
54 changes: 54 additions & 0 deletions ckan/setup/GSA_prerun.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os
import sys
import time
try:
from urllib.request import urlopen
from urllib.error import URLError
except ImportError:
from urllib2 import urlopen
from urllib2 import URLError

import prerun as pr

RETRY = 5


def check_solr_connection(retry=None):
    """Wait until the Solr core named by CKAN_SOLR_URL answers a select query.

    Issues ``<CKAN_SOLR_URL>/select/?q=*&wt=json``. On ``URLError`` it prints
    the error, sleeps 10 seconds and tries again with ``retry - 1``.

    :param retry: attempts remaining. ``None`` (the default) starts from the
        module-level ``RETRY``; ``0`` prints a give-up message and terminates
        the process with exit status 1.
    :returns: ``None`` once Solr has answered.
    """
    # Imported locally so this function does not depend on edits to the
    # file's top-level import block.
    import json

    if retry is None:
        retry = RETRY
    elif retry == 0:
        print("[prerun] Giving up after 5 tries...")
        sys.exit(1)

    url = os.environ.get("CKAN_SOLR_URL", "")
    search_url = "{url}/select/?q=*&wt=json".format(url=url)

    try:
        connection = urlopen(search_url)
    except URLError as e:
        print(str(e))
        print("[prerun] Unable to connect to solr, waiting...")
        time.sleep(10)
        check_solr_connection(retry=retry - 1)
    else:
        try:
            # Parse the body as JSON instead of eval()-ing it: eval on data
            # received over the network can execute arbitrary code, and on
            # Python 3 the old str(bytes) round trip validated nothing.
            json.loads(connection.read().decode("utf-8"))
        except ValueError:
            # Best effort, as before: a reachable Solr with an unexpected
            # payload does not abort start-up. UnicodeDecodeError and
            # JSONDecodeError are both ValueError subclasses.
            print("[prerun] Solr responded, but the response was not valid JSON")
        finally:
            connection.close()


if __name__ == "__main__":
    # Script entry point. Unless MAINTENANCE_MODE is "true" (case-insensitive),
    # run the helpers from the imported prerun module in a fixed order, with
    # this file's own Solr check slotted in before the sysadmin step.
    if os.environ.get("MAINTENANCE_MODE", "").lower() != "true":
        pr.check_main_db_connection()
        pr.init_db()
        pr.update_plugins()
        pr.check_datastore_db_connection()
        pr.init_datastore_db()
        # Local replacement for the prerun module's Solr check
        # (presumably needed for Solr 8 — confirm against prerun.py).
        check_solr_connection()
        pr.create_sysadmin()
    else:
        print("[prerun] Maintenance mode, skipping setup...")
19 changes: 17 additions & 2 deletions ckan/setup/ckan_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,23 @@ ckan config-tool $SRC_DIR/ckan/test-core.ini \
"solr_url = $TEST_CKAN_SOLR_URL" \
"ckan.redis.url = $TEST_CKAN_REDIS_URL"

# SOLR takes a while to boot up in zookeeper mode, so make sure it's up before
# continuing. Poll the Collections API (action=list) until curl succeeds or ten
# attempts have been made, sleeping one second longer before each retry.
echo "Validating SOLR is up..."
NEXT_WAIT_TIME=0
# curl has no --quiet option (that spelling belongs to wget); an unknown option
# makes curl exit non-zero on every call, so the probe could never succeed.
# --silent is the curl equivalent.
until [ "$NEXT_WAIT_TIME" -eq 10 ] || curl --get --fail --silent --location-trusted \
    --user "$CKAN_SOLR_USER:$CKAN_SOLR_PASSWORD" \
    "$CKAN_SOLR_BASE_URL/solr/admin/collections" \
    --data-urlencode action=list \
    --data-urlencode wt=json; do
  sleep $(( NEXT_WAIT_TIME++ ))
  echo "SOLR still not up, trying for the $NEXT_WAIT_TIME time"
done
# Returns non-zero when all ten attempts were used up. This only aborts the
# script if errexit (set -e) is in effect, which is not visible here — TODO confirm.
[ "$NEXT_WAIT_TIME" -lt 10 ]

# Add ckan core to solr
/app/ckan/setup/migrate-solrcloud-schema.sh

# Run the prerun script to init CKAN and create the default admin user
python3 prerun.py
python GSA_prerun.py

# Run any startup scripts provided by images extending this one
if [[ -d "/docker-entrypoint.d" ]]
Expand All @@ -66,4 +81,4 @@ then
done
fi

exec /app/ckan/setup/server_start.sh
exec /app/ckan/setup/server_start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,18 @@ if ! (curl --get --fail --location-trusted --user $CKAN_SOLR_USER:$CKAN_SOLR_PA
--data-urlencode action=list \
--data-urlencode wt=json | grep -q $COLLECTION_NAME); then

cd $(dirname $0)/solr

CKAN_BRANCH="dev-v2.9"
curl https://raw.githubusercontent.com/ckan/ckan/$CKAN_BRANCH/ckan/config/solr/schema.xml -o managed-schema

# Fix from https://github.com/ckan/ckan/issues/5585#issuecomment-953586246
sed -i "s/<defaultSearchField>text<\/defaultSearchField>/<df>text<\/df>/" managed-schema
sed -i "s/<solrQueryParser defaultOperator=\"AND\"\/>/<solrQueryParser q.op=\"AND\"\/>/" managed-schema

# Zip solr configSet
cd solr && zip ckan_2.9_solr_config.zip \
currency.xml elevate.xml protwords.txt schema.xml solrconfig.xml stopwords.txt synonyms.txt
zip ckan_2.9_solr_config.zip \
managed-schema solrconfig.xml protwords.txt stopwords.txt synonyms.txt

echo "Uploading config set..."
curl --fail --location-trusted --user $CKAN_SOLR_USER:$CKAN_SOLR_PASSWORD \
Expand All @@ -36,5 +45,6 @@ if ! (curl --get --fail --location-trusted --user $CKAN_SOLR_USER:$CKAN_SOLR_PA
curl --fail --location-trusted --user $CKAN_SOLR_USER:$CKAN_SOLR_PASSWORD \
"$CKAN_SOLR_BASE_URL/solr/admin/collections?action=create&name=$COLLECTION_NAME&collection.configName=$COLLECTION_NAME&numShards=1" \
-X POST

cd -
fi
42 changes: 21 additions & 21 deletions solr/protwords.txt → ckan/setup/solr/protwords.txt
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#-----------------------------------------------------------------------
# Use a protected word file to protect against the stemmer reducing two
# unrelated words to the same base word.

# Some non-words that normally won't be encountered,
# just to test that they won't be stemmed.
dontstems
zwhacky

# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-----------------------------------------------------------------------
# Use a protected word file to protect against the stemmer reducing two
# unrelated words to the same base word.
# Some non-words that normally won't be encountered,
# just to test that they won't be stemmed.
dontstems
zwhacky
Loading

0 comments on commit e1a38a9

Please sign in to comment.