Skip to content

Commit

Permalink
Merge pull request #2 from impresso/for-impresso2
Browse files Browse the repository at this point in the history
For impresso2
  • Loading branch information
simon-clematide authored Apr 13, 2024
2 parents b9b5924 + 32c7191 commit 7fb8df4
Show file tree
Hide file tree
Showing 8 changed files with 642 additions and 31 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,7 @@
/.venv/
Makefile.local.mk
*.d/
.env
*.log
out.txt
testbuild/
30 changes: 24 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
##########################################################################################
# Make setup

SHELL := .venv/bin/pipenv run /bin/bash -u
SHELL:=/bin/bash

# Exported so that every bash started for a recipe inherits these options:
# errexit aborts a recipe line at the first failing command, pipefail makes a
# pipeline fail if any of its stages fails.
export SHELLOPTS := errexit:pipefail
# Without prerequisites this treats all targets as secondary, so make never
# deletes intermediate files automatically.
.SECONDARY:

Expand All @@ -24,13 +25,20 @@ BUILD_DIR ?= build.d
REBUILT_DIR ?= /srv/scratch2/climpresso/s3data/canonical-rebuilt-release


# Destination path on the s3-impresso rclone remote used by upload-release-to-s3
# (presumably bucket name plus prefix - confirm against the rclone configuration).
# Assigned with ?= so it can be overridden from the environment or command line.
S3_BUCKET_LINGPROC_PATH ?= 42-processed-data-final/lingproc

# Version directory appended to S3_BUCKET_LINGPROC_PATH for the upload destination.
S3_LINGPROC_VERSION ?= v2024.04.04

# used for debugging variables from the make process
include lib/debug.mk

#: Print a short usage summary listing the main targets
help:
# Available targets:
@echo " impresso-linguistic-processing-target # Process all impresso rebuilt files."
@echo " update-requirements # Update the requirements.txt file with the current pipenv requirements."
@echo " test-txt # Test the linguistic preprocessing output."
@echo "Usage: make <target>"
@echo "Targets:"
@echo " impresso-linguistic-processing-target # Process all impresso rebuilt files."
@echo " update-requirements # Update the requirements.txt file with the current pipenv requirements."
@echo " help # Show this help message"

# Running make without a target prints the help text instead of starting a build.
.DEFAULT_GOAL := help
PHONY_TARGETS += help

Expand Down Expand Up @@ -63,14 +71,24 @@ impresso-linguistic-processing-target : $(impresso-linguistic-processing-files)

#: Build one output file from the rebuilt file with the same stem (first prerequisite)
#: and the matching language identification file (second prerequisite, passed via --lid).
# The script's stderr is written to $@.log next to the target.
# NOTE: if the command fails, the partial target is removed and the recipe exits with
# the status of `rm -f`, so make does not stop at the failing file.
$(BUILD_DIR)/%.jsonl.bz2: $(IMPRESSO_REBUILT_DATA_DIR)/%.jsonl.bz2 $(IMPRESSO_LANGIDENT_DATA_DIR)/%.jsonl.bz2
mkdir -p $(@D) &&\
python3 $(LIB)/spacy_linguistic_preprocessing.py \
python3 $(LIB)/spacy_linguistic_processing.py \
$< \
--lid $(word 2,$^) \
--validate \
-o $@ \
2> $@.log \
|| rm -f $@



#: Actually upload the impresso linguistic information to s3 impresso bucket
# Copies only *.jsonl.bz2 files from $(BUILD_DIR) to the versioned destination;
# files that already exist at the destination are skipped (--ignore-existing).
upload-release-to-s3: impresso-linguistic-processing-target
	rclone --verbose copy $(BUILD_DIR)/ s3-impresso:$(S3_BUCKET_LINGPROC_PATH)/$(S3_LINGPROC_VERSION) --include "*.jsonl.bz2" --ignore-existing

# TODO(review): optionally verify the upload afterwards, e.g. by appending
#   && rclone --verbose check $(BUILD_DIR)/ s3-impresso:$(S3_BUCKET_LINGPROC_PATH)/$(S3_LINGPROC_VERSION)/
# to the recipe line above (confirm the source directory before enabling).

# The target does not produce a file named after itself, so register it as phony.
PHONY_TARGETS += upload-release-to-s3


#: Update requirements.txt with the current pipenv requirements
# The output is written to a temporary file first and only moved into place on
# success, so a failing pipenv call no longer leaves a truncated requirements.txt.
update-requirements:
	pipenv requirements > requirements.txt.tmp \
	|| { rm -f requirements.txt.tmp; exit 1; }
	mv -f requirements.txt.tmp requirements.txt

Expand Down
9 changes: 9 additions & 0 deletions Makefile.local.mk.sample
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,12 @@
IMPRESSO_REBUILT_DATA_DIR ?= test/rebuilt-data
IMPRESSO_LANGIDENT_DATA_DIR ?= test/langident
BUILD_DIR ?= testbuild.d

# Manual smoke test: process a single rebuilt file read directly from S3, using its
# language identification file from S3, and write the result to testbuild.d/.
# Uses the same script as the main pattern rule in the Makefile
# (spacy_linguistic_processing.py) and runs it with --validate.
test-s3: Makefile
	mkdir -p testbuild.d/ && \
	python3 lib/spacy_linguistic_processing.py \
	s3://22-rebuilt-final/actionfem/actionfem-1928.jsonl.bz2 \
	--lid s3://42-processed-data-final/langident/v1.4.4/actionfem/actionfem-1928.jsonl.bz2 \
	-o testbuild.d/actionfem-1928.jsonl.bz2 \
	--validate

12 changes: 10 additions & 2 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,24 @@ url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]
impresso-commons ="*"

[packages]
smart-open = "*"
smart-open= {extras = ["s3"], version = "==6.4"}
spacy = "==3.6"
python-dotenv = "*"
spacy-lookups-data = "*"
de-core-news-md = {file = "https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.6.0/de_core_news_md-3.6.0.tar.gz"}
# https://github.com/explosion/spacy-models/releases/tag/fr_core_news_md-3.6.0
fr-core-news-md = {file = "https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.6.0/fr_core_news_md-3.6.0.tar.gz"}
en-core-web-md = {file = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0.tar.gz"}
# The Luxembourgish model is stored in this repository and comes from https://github.com/PeterGilles/Luxembourgish-language-resources

boto3 = "*"
jsonschema = "*"
# pin numpy to version 1.x (otherwise version 2 would be installed)
numpy = "==1.26.*"
[requires]
python_version = "3.11"

[pipenv]
allow_prereleases = true
217 changes: 208 additions & 9 deletions Pipfile.lock

Large diffs are not rendered by default.

34 changes: 24 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
# Information on impresso linguistic preprocessing

This repository implements the following linguistic processing steps:
- POS tagging
- NER tagging
- improved lemmatization

- POS tagging
- NER tagging
- improved lemmatization

We do this for the following languages:
- fr
- de
- lb (only POS tagging)
- en

- fr
- de
- lb (only POS tagging)
- en

## Prerequisites

The build process has been tested on modern Linux and macOS systems and requires
Python 3.11. Under Debian, make sure to have the following packages installed:

Expand All @@ -32,10 +36,20 @@ $ python3.11 -mpipenv shell
```

# Running the pipeline
Adapt the local paths for the input and output directories in the `Makefile` and run the following command:

Adapt the local paths for the input and output directories in
`Makefile.local.mk` (see `Makefile.local.mk.sample` for an example)
and run the following command:

```sh
make impresso-linguistic-processing-target -j N
```

# Uploading to impresso S3
@TODO
# Uploading to impresso S3 bucket

Ensure that the environment variables SE_ACCESS_KEY and SE_SECRET_KEY for access to the
impresso S3 infrastructure are set, e.g. by setting them in a local `.env` file.

```sh
make upload-release-to-s3
```
Loading

0 comments on commit 7fb8df4

Please sign in to comment.