Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Improve node creation performance #338

Merged
merged 1 commit into from
Jan 17, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 32 additions & 25 deletions parser/openfoodfacts_taxonomy_parser/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,24 +32,28 @@ def _get_project_name(self, taxonomy_name: str, branch_name: str):

def _create_other_node(self, tx: Transaction, node_data: NodeData, project_label: str):
"""Create a TEXT, SYNONYMS or STOPWORDS node"""
position_query = """
SET n.id = $id
SET n.preceding_lines = $preceding_lines
SET n.src_position = $src_position
"""
if node_data.get_node_type() == NodeType.TEXT:
id_query = f"CREATE (n:{project_label}:TEXT) \n"
type_label = "TEXT"
elif node_data.get_node_type() == NodeType.SYNONYMS:
id_query = f"CREATE (n:{project_label}:SYNONYMS) \n"
type_label = "SYNONYMS"
elif node_data.get_node_type() == NodeType.STOPWORDS:
id_query = f"CREATE (n:{project_label}:STOPWORDS) \n"
type_label = "STOPWORDS"
else:
raise ValueError(f"ENTRY nodes should not be passed to this function")

entry_queries = [f"SET n.{key} = ${key}" for key in node_data.tags]
entry_query = "\n".join(entry_queries) + "\n"
node_tags_queries = [f"{key} : ${key}" for key in node_data.tags]

base_properties_query = """
id: $id,
preceding_lines: $preceding_lines,
src_position: $src_position
"""

query = id_query + entry_query + position_query
properties_query = ",\n".join([base_properties_query, *node_tags_queries])
eric-nguyen-cs marked this conversation as resolved.
Show resolved Hide resolved

query = f"""
CREATE (n:{project_label}:{type_label} {{ {properties_query} }})
"""
tx.run(query, node_data.to_dict())

def _create_other_nodes(self, other_nodes: list[NodeData], project_label: str):
Expand All @@ -70,16 +74,6 @@ def _create_entry_nodes(self, entry_nodes: list[NodeData], project_label: str):
self.parser_logger.info("Creating ENTRY nodes")
start_time = timeit.default_timer()

base_query = f"""
WITH $entry_nodes as entry_nodes
UNWIND entry_nodes as entry_node
CREATE (n:{project_label}:ENTRY)
SET n.id = entry_node.id
SET n.preceding_lines = entry_node.preceding_lines
SET n.src_position = entry_node.src_position
SET n.main_language = entry_node.main_language
"""

# we don't know in advance which properties and tags
# we will encounter in the batch
# so we accumulate them in this set
Expand All @@ -91,11 +85,24 @@ def _create_entry_nodes(self, entry_nodes: list[NodeData], project_label: str):
seen_properties_and_tags.update(entry_node.tags)
seen_properties_and_tags.update(entry_node.properties)

additional_query = "\n" + "\n".join(
[f"SET n.{key} = entry_node.{key}" for key in seen_properties_and_tags]
)
additional_properties_queries = [
f"{key} : entry_node.{key}" for key in seen_properties_and_tags
]

base_properties_query = f"""
id: entry_node.id,
preceding_lines: entry_node.preceding_lines,
src_position: entry_node.src_position,
main_language: entry_node.main_language
"""

properties_query = ",\n".join([base_properties_query, *additional_properties_queries])

query = base_query + additional_query
query = f"""
WITH $entry_nodes as entry_nodes
UNWIND entry_nodes as entry_node
CREATE (n:{project_label}:ENTRY {{ {properties_query} }})
"""
self.session.run(query, entry_nodes=[entry_node.to_dict() for entry_node in entry_nodes])

self.parser_logger.info(
Expand Down