Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Configure Csv delimiter #716

Merged
merged 7 commits into from
Mar 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions datasets/songs_custom_delimiter.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
id;title;album;artist;genre;country;released;duration;released-timestamp;duration-float
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we could reduce this csv file to a few lines :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes! Thanks to @sanders41 — in case you were wondering, the tests have been missing from the package build since #367.

702481615;Armatage Shanks;Dookie: The Ultimate Critical Review;Green Day;Rock;Europe;2005;;1104537600;
888221515;Old Folks;Six Classic Albums Plus Bonus Tracks;Harold Land;Jazz;Europe;2013;6:36;1356998400;6.36
1382413601;คำขอร้อง;สำเนียงคนจันทร์ / เอาเถอะถ้าเห็นเขาดีกว่า;อิทธิพล บำรุงกุล;"Folk; World; & Country";Thailand;;;;
190889300;Track 1;Summer Breeze;Dreas;Funk / Soul;US;2008;18:56;1199145600;18.56
813645611;Slave (Alternative Version);Honky Château;Elton John;Rock;Europe;;2:53;;2.5300000000000002
394018506;Sex & Geld;Trackz Für Den Index;Mafia Clikk;Hip Hop;Germany;2006;5:02;1136073600;5.02
1522481803;Pisciaunella;Don Pepp U Pacce;Giovanni Russo (2);"Folk; World; & Country";Italy;1980;;315532800;
862296713;不知;Kiss 2001 Hong Kong Live Concert;Various;Electronic;Hong Kong;2002-04-13;;1018656000;
467946423;Rot;Be Quick Or Be Dead Vol. 3;Various;Electronic;Serbia;2013-06-20;1:00;1371686400;1
1323854803;"Simulation Project 1; ツキハナ「Moonflower」";Unlimited Dream Company;Amun Dragoon;Electronic;US;2018-04-10;2:44;1523318400;2.44
235115704;Doctor Vine;The Big F;The Big F;Rock;US;1989;5:29;599616000;5.29
249025232;"Ringel; Ringel; Reihe";Kinderlieder ABC - Der Bielefelder Kinderchor Singt 42 Lieder Von A-Z;Der Bielefelder Kinderchor;Children's;Germany;1971;;31536000;
710094000;Happy Safari = Safari Feliz;Safari Swings Again = El Safari Sigue En Su Swing;Bert Kaempfert & His Orchestra;Jazz;Argentina;1977;2:45;220924800;2.45
538632700;Take Me Up;Spring;Various;Electronic;US;2000;3:06;946684800;3.06
1556505508;Doin To Me ( Radio Version );Say My Name;Netta Dogg;Hip Hop;US;2005;;1104537600;
1067031900;Concerto For Balloon & Orchestra / Concerto For Synthesizer & Orchestra;Concerto For Balloon & Orchestra And Three Overtures;Stanyan String & Wind Ensemble;Classical;US;1977;;220924800;
137251914;"I Love The Nightlife (Disco 'Round) (Real Rapino 7"" Mix)";The Adventures Of Priscilla: Queen Of The Desert - Original Motion Picture Soundtrack;Various;Stage & Screen;US;1994;3:31;757382400;3.31
554983904;Walking On The Moon;Certifiable (Live In Buenos Aires);The Police;Rock;Malaysia;2008-11-00;;1225497600;
557616002;Two Soldiers;Jerry Garcia / David Grisman;David Grisman;"Folk; World; & Country";US;2014-04-00;4:24;1396310400;4.24
878936809;When You Gonna Learn;Live At Firenze 93;Jamiroquai;Funk / Soul;France;2004;13:01;1072915200;13.01
37 changes: 28 additions & 9 deletions meilisearch/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,7 @@ def add_documents_csv(
self,
str_documents: str,
primary_key: Optional[str] = None,
csv_delimiter: Optional[str] = None,
) -> TaskInfo:
"""Add string documents from a CSV file to the index.

Expand All @@ -443,6 +444,8 @@ def add_documents_csv(
String of document from a CSV file.
primary_key (optional):
The primary-key used in index. Ignored if already set up.
csv_delimiter:
One ASCII character used to customize the delimiter for CSV. Comma used by default.

Returns
-------
Expand All @@ -455,7 +458,7 @@ def add_documents_csv(
MeiliSearchApiError
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://docs.meilisearch.com/errors/#meilisearch-errors
"""
return self.add_documents_raw(str_documents, primary_key, "text/csv")
return self.add_documents_raw(str_documents, primary_key, "text/csv", csv_delimiter)

def add_documents_ndjson(
self,
Expand Down Expand Up @@ -489,6 +492,7 @@ def add_documents_raw(
str_documents: str,
primary_key: Optional[str] = None,
content_type: Optional[str] = None,
csv_delimiter: Optional[str] = None,
alallema marked this conversation as resolved.
Show resolved Hide resolved
) -> TaskInfo:
"""Add string documents to the index.

Expand All @@ -499,7 +503,10 @@ def add_documents_raw(
primary_key (optional):
The primary-key used in index. Ignored if already set up.
type:
The type of document. Type available: 'csv', 'json', 'jsonl'
The type of document. Type available: 'csv', 'json', 'jsonl'.
csv_delimiter:
One ASCII character used to customize the delimiter for CSV.
Note: The csv delimiter can only be used with the Content-Type text/csv.

Returns
-------
Expand All @@ -512,7 +519,7 @@ def add_documents_raw(
MeiliSearchApiError
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://docs.meilisearch.com/errors/#meilisearch-errors
"""
url = self._build_url(primary_key)
alallema marked this conversation as resolved.
Show resolved Hide resolved
url = self._build_url(primary_key=primary_key, csv_delimiter=csv_delimiter)
response = self.http.post(url, str_documents, content_type)
return TaskInfo(**response)

Expand Down Expand Up @@ -601,6 +608,7 @@ def update_documents_csv(
self,
str_documents: str,
primary_key: Optional[str] = None,
csv_delimiter: Optional[str] = None,
) -> TaskInfo:
"""Update documents as a csv string in the index.

Expand All @@ -609,7 +617,9 @@ def update_documents_csv(
str_documents:
String of document from a CSV file.
primary_key (optional):
The primary-key used in index. Ignored if already set up
The primary-key used in index. Ignored if already set up.
csv_delimiter:
One ASCII character used to customize the delimiter for CSV. Comma used by default.

Returns
-------
Expand All @@ -622,13 +632,14 @@ def update_documents_csv(
MeiliSearchApiError
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://docs.meilisearch.com/errors/#meilisearch-errors
"""
return self.update_documents_raw(str_documents, primary_key, "text/csv")
return self.update_documents_raw(str_documents, primary_key, "text/csv", csv_delimiter)

def update_documents_raw(
self,
str_documents: str,
primary_key: Optional[str] = None,
content_type: Optional[str] = None,
csv_delimiter: Optional[str] = None,
alallema marked this conversation as resolved.
Show resolved Hide resolved
) -> TaskInfo:
"""Update documents as a string in the index.

Expand All @@ -640,6 +651,9 @@ def update_documents_raw(
The primary-key used in index. Ignored if already set up.
type:
The type of document. Type available: 'csv', 'json', 'jsonl'
csv_delimiter:
One ASCII character used to customize the delimiter for CSV.
Note: The csv delimiter can only be used with the Content-Type text/csv.

Returns
-------
Expand All @@ -652,7 +666,7 @@ def update_documents_raw(
MeiliSearchApiError
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://docs.meilisearch.com/errors/#meilisearch-errors
"""
url = self._build_url(primary_key)
url = self._build_url(primary_key=primary_key, csv_delimiter=csv_delimiter)
response = self.http.put(url, str_documents, content_type)
return TaskInfo(**response)

Expand Down Expand Up @@ -1530,8 +1544,13 @@ def __settings_url_for(self, sub_route: str) -> str:
def _build_url(
self,
primary_key: Optional[str] = None,
csv_delimiter: Optional[str] = None,
) -> str:
if primary_key is None:
parameters = {}
if primary_key:
parameters["primaryKey"] = primary_key
if csv_delimiter:
parameters["csvDelimiter"] = csv_delimiter
if primary_key is None and csv_delimiter is None:
Comment on lines +1549 to +1554
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like that much to see these if conditions because they usually hide a code that will grow indefinitely, WDYT about this?

Suggested change
parameters = {}
if primary_key:
parameters["primaryKey"] = primary_key
if csv_delimiter:
parameters["csvDelimiter"] = csv_delimiter
if primary_key is None and csv_delimiter is None:
parameters = { "csvDelimiter": csv_delimiter, "primaryKey": primary_key }
parameters = dict((k, v) for k, v in parameters.items() if v)
if primary_key is None and csv_delimiter is None:

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will work, but I would argue it is less readable and not obvious what you are doing. If you do go this direction you could clean it up some:

parameters = {k: v for k, v in parameters.items() if v}

With either direction it could also be worth it to short circuit so parameters don't get created and run if not needed:

    def _build_url(
        self,
        primary_key: Optional[str] = None,
        csv_delimiter: Optional[str] = None,
    ) -> str:
        if primary_key is None and csv_delimiter is None:
            return f"{self.config.paths.index}/{self.uid}/{self.config.paths.document}"

           parameters = { "csvDelimiter": csv_delimiter, "primaryKey": primary_key }
           parameters = {k: v for k, v in parameters.items() if v}

        return f"{self.config.paths.index}/{self.uid}/{self.config.paths.document}?{parse.urlencode(parameters)}"

Copy link
Contributor Author

@alallema alallema Mar 23, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with sanders41. In general, I prefer to avoid using the magic one-liners of Python to avoid complex understanding.
And we should not have to add a new one. But if we do, I will change it for your code!

return f"{self.config.paths.index}/{self.uid}/{self.config.paths.document}"
primary_key = parse.urlencode({"primaryKey": primary_key})
return f"{self.config.paths.index}/{self.uid}/{self.config.paths.document}?{primary_key}"
return f"{self.config.paths.index}/{self.uid}/{self.config.paths.document}?{parse.urlencode(parameters)}"
sanders41 marked this conversation as resolved.
Show resolved Hide resolved
13 changes: 11 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,16 +70,25 @@ def small_movies_json_file():
@fixture(scope="session")
def songs_csv():
"""
Runs once per session. Provides the content of songs.csv from read..
Runs once per session. Provides the content of songs.csv from read.
"""
with open("./datasets/songs.csv", encoding="utf-8") as song_csv_file:
return song_csv_file.read().encode("utf-8")


@fixture(scope="session")
def songs_csv_custom_separator():
"""
Runs once per session. Provides the content of songs_custom_delimiter.csv from read.
"""
with open("./datasets/songs_custom_delimiter.csv", encoding="utf-8") as song_csv_file:
return song_csv_file.read().encode("utf-8")


@fixture(scope="session")
def songs_ndjson():
"""
Runs once per session. Provides the content of songs.ndjson from read..
Runs once per session. Provides the content of songs.ndjson from read.
"""
with open("./datasets/songs.ndjson", encoding="utf-8") as song_ndjson_file:
return song_ndjson_file.read().encode("utf-8")
Expand Down
28 changes: 28 additions & 0 deletions tests/index/test_index_document_meilisearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,20 @@ def test_add_documents_csv(empty_index, songs_csv):
assert index.get_primary_key() == "id"


def test_add_documents_csv_with_delimiter(empty_index, songs_csv_custom_separator):
    """Tests adding new documents from a CSV string with a custom delimiter."""
    index = empty_index("csv-delimiter")
    response = index.add_documents_csv(songs_csv_custom_separator, csv_delimiter=";")
    assert isinstance(response, TaskInfo)
    assert response.task_uid is not None
    task = index.wait_for_task(response.task_uid)
    assert task.status == "succeeded"
    # The custom-delimiter dataset holds 20 songs; a count mismatch would mean
    # the ";" delimiter was not honored and rows were mis-parsed.
    assert task.details["receivedDocuments"] == 20
    documents = index.get_documents().results
    assert documents[1].country == "Europe"
    assert documents[4].artist == "Elton John"


def test_update_documents_csv(index_with_documents, songs_csv):
"""Tests updating a single document with csv string."""
index = index_with_documents()
Expand All @@ -208,6 +222,20 @@ def test_update_documents_csv(index_with_documents, songs_csv):
assert index.get_primary_key() == "id"


def test_update_documents_csv_with_delimiter(index_with_documents, songs_csv_custom_separator):
    """Tests updating documents from a CSV string with a custom delimiter."""
    index = index_with_documents()
    response = index.update_documents_csv(songs_csv_custom_separator, csv_delimiter=";")
    assert isinstance(response, TaskInfo)
    assert response.task_uid is not None
    task = index.wait_for_task(response.task_uid)
    assert task.status == "succeeded"
    # The custom-delimiter dataset holds 20 songs; a count mismatch would mean
    # the ";" delimiter was not honored and rows were mis-parsed.
    assert task.details["receivedDocuments"] == 20
    document = index.get_document("813645611")
    assert document.country == "Europe"
    assert document.artist == "Elton John"


def test_add_documents_json(empty_index, small_movies_json_file):
"""Tests adding new documents to a clean index."""
index = empty_index()
Expand Down
4 changes: 2 additions & 2 deletions tests/index/test_index_search_meilisearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


def test_basic_search(index_with_documents):
    """Tests search with a simple query."""
    response = index_with_documents().search("How to Train Your Dragon")
    assert isinstance(response, dict)
    assert response["hits"][0]["id"] == "166428"
Expand Down Expand Up @@ -356,7 +356,7 @@ def test_phrase_search(index_with_documents):


def test_basic_search_on_nested_documents(index_with_documents, nested_movies):
    """Tests search with a simple query on nested fields."""
    response = index_with_documents("nested_fields_index", nested_movies).search("An awesome")
    assert isinstance(response, dict)
    assert response["hits"][0]["id"] == 5
Expand Down