From 57a385685d1b3018198aa2915447929154375cad Mon Sep 17 00:00:00 2001 From: Maud Royer Date: Fri, 6 Sep 2024 11:14:28 +0200 Subject: [PATCH] feat: better search Signed-off-by: Maud Royer --- README.md | 24 +- package.json | 2 +- src/app/rechercher/page.tsx | 232 +++++++++++++----- src/db/index.ts | 8 + .../migrations/20240904085535_searchIndex.ts | 26 ++ src/db/seeds/1725439927263_searchIndex.ts | 56 +++++ src/displayUtils.ts | 10 +- 7 files changed, 285 insertions(+), 73 deletions(-) create mode 100644 src/db/migrations/20240904085535_searchIndex.ts create mode 100644 src/db/seeds/1725439927263_searchIndex.ts diff --git a/README.md b/README.md index 7b2a6b8..b5aea00 100644 --- a/README.md +++ b/README.md @@ -35,13 +35,27 @@ de la base de données publique des médicaments. Celles-ci sont transmises sous la forme d'un dump `.sql` et d'un dossier contenant les images. -La base MySQL doit être restaurée depuis le dump. +Ces données sont stockées par MySQL, et doivent être restaurées +depuis le dump transmis par l'ANSM. La base de données MySQL +ne doit pas être modifiée, et doit rester un simple clone +de la base de données publique des médicaments. + +### Données spécifiques à l'application -Les images sont stockées dans la base de données -PostgreSQL, de l'application et peuvent -être chargée avec et doivent être copiées -avec [le module `seed` de Kysely](https://sillon.incubateur.net/docs/database-for-everything/file-storage/). +Info Médicament utilise une base de données PostgreSQL +pour stocker les données spécifiques à l'application : +* les images des notices (pour éviter d'avoir à les stocker dans un système de fichiers) +* les index de recherche plein texte + +Vous devez d'abord jouer les migrations pour créer les tables, +puis charger les données. La base MySQL doit être accessible préalablement. 
```bash +# Créer les tables +kysely migrate:latest + +# Charger les images et les index de recherche +# Le chemin vers le dossier contenant les images des notices doit être spécifié +# avec la variable d'environnement LEAFLET_IMAGES LEAFLET_IMAGES=/path/to/folder kysely seed run ``` \ No newline at end of file diff --git a/package.json b/package.json index 4887c01..0297595 100644 --- a/package.json +++ b/package.json @@ -8,7 +8,7 @@ "start": "next start", "lint": "next lint", "predev": "only-include-used-icons", - "prebuild": "only-include-used-icons" + "prebuild": "only-include-used-icons && kysely migrate:latest && kysely seed run --specific 1725439927263_searchIndex" }, "dependencies": { "@codegouvfr/react-dsfr": "^1.9.22", diff --git a/src/app/rechercher/page.tsx b/src/app/rechercher/page.tsx index 36a0cb7..d36d226 100644 --- a/src/app/rechercher/page.tsx +++ b/src/app/rechercher/page.tsx @@ -1,41 +1,148 @@ import Link from "next/link"; +import { sql } from "kysely"; import Button from "@codegouvfr/react-dsfr/Button"; import Input from "@codegouvfr/react-dsfr/Input"; import { pdbmMySQL, Specialite, SubstanceNom } from "@/db/pdbmMySQL"; import { fr } from "@codegouvfr/react-dsfr"; import Badge from "@codegouvfr/react-dsfr/Badge"; +import db, { SearchResult } from "@/db"; import { formatSpecName, groupSpecialites } from "@/displayUtils"; import liste_CIS_MVP from "@/liste_CIS_MVP.json"; -async function getResults(query: string) { - const specialites: Specialite[] = ( - await pdbmMySQL - .selectFrom("Specialite") - .where("SpecDenom01", "like", `%${query}%`) - .selectAll() - .execute() - ).filter((specialite) => liste_CIS_MVP.includes(specialite.SpecId)); +type SearchResultItem = + | SubstanceNom + | { groupName: string; specialites: Specialite[] }; - const substances: SubstanceNom[] = await pdbmMySQL - .selectFrom("Subs_Nom") - .where(({ eb, selectFrom }) => - eb( - "NomId", - "in", - selectFrom("Composant") - .select("NomId") - .where("SpecId", "in", 
liste_CIS_MVP), - ), - ) - .where("NomLib", "like", `%${query}%`) +async function getSpecialites(specialitesId: string[], substancesId: string[]) { + return specialitesId.length + ? await pdbmMySQL + .selectFrom("Specialite") + .leftJoin("Composant", "Specialite.SpecId", "Composant.SpecId") + .where(({ eb }) => + eb.or([ + eb("Specialite.SpecId", "in", specialitesId), + eb("Composant.NomId", "in", substancesId), + ]), + ) + .where("Specialite.SpecId", "in", liste_CIS_MVP) + .selectAll("Specialite") + .select("NomId") + .execute() + : []; +} + +async function getSubstances(substancesId: string[]) { + const substances: SubstanceNom[] = substancesId.length + ? await pdbmMySQL + .selectFrom("Subs_Nom") + .where("NomId", "in", substancesId) + .where(({ eb, selectFrom }) => + eb( + "NomId", + "in", + selectFrom("Composant") + .select("NomId") + .where("SpecId", "in", liste_CIS_MVP), + ), + ) + .selectAll() + .execute() + : []; + return substances; +} + +/** + * Get search results from the database + * + * The search results are generated and ordered by the following rules: + * 1. We get all substances and specialites matches from the search_index table + * 2. We retrieve all substances, all direct match for specialities, + * and all specialities that have a match with a substance + * 3. We group the specialities by their group name + * 4. 
The score of each result is the word similarity between the search query and the token, + * for specialities, we sum direct match score and substance match score + */ +async function getResults(query: string): Promise { + const dbQuery = db + .selectFrom("search_index") .selectAll() - .execute(); + .select(({ fn, val }) => [ + fn("word_similarity", [val(query), "token"]).as("sml"), + ]) + .where("token", sql`%>`, query) + .orderBy("sml", "desc") + .orderBy(({ fn }) => fn("length", ["token"])); + + const matches = (await dbQuery.execute()) as (SearchResult & { + sml: number; + })[]; + + if (matches.length === 0) return []; + + const specialitesId = matches + .filter((r) => r.table_name === "Specialite") + .map((r) => r.id); + const substancesId = matches + .filter((r) => r.table_name === "Subs_Nom") + .map((r) => r.id); - return { - specialites, - substances, - }; + const specialites = await getSpecialites(specialitesId, substancesId); + const specialiteGroups = Array.from(groupSpecialites(specialites).entries()); + const substances = await getSubstances(substancesId); + + return matches + .reduce((acc: { score: number; item: SearchResultItem }[], match) => { + if (match.table_name === "Subs_Nom") { + const substance = substances.find( + (s) => s.NomId.trim() === match.id.trim(), + ); // if undefined, the substance is not in one of the 500 CIS list + if (substance) { + acc.push({ score: match.sml, item: substance }); + + specialiteGroups + .filter(([, specialites]) => + specialites.find( + (s) => s.NomId && s.NomId.trim() === substance.NomId.trim(), + ), + ) + .forEach(([groupName, specialites]) => { + if ( + !acc.find((a) => "groupName" in a && a.groupName === groupName) + ) { + let directMatch = matches.find( + (m) => + m.table_name === "Specialite" && + specialites.find((s) => s.SpecId.trim() === m.id.trim()), + ); + acc.push({ + score: directMatch ? 
directMatch.sml + match.sml : match.sml, + item: { groupName, specialites }, + }); + } + }); + } + } + + if (match.table_name === "Specialite") { + const specialiteGroup = specialiteGroups.find(([, specialites]) => + specialites.find((s) => s.SpecId.trim() === match.id.trim()), + ); // if undefined, the specialite is not in the 500 CIS list + if ( + specialiteGroup && + !acc.find( + (a) => "groupName" in a && a.groupName === specialiteGroup[0], + ) + ) { + const [groupName, specialites] = specialiteGroup; + acc.push({ score: match.sml, item: { groupName, specialites } }); + } + } + + return acc; + }, []) + .sort((a, b) => b.score - a.score) + .map(({ item }) => item); } export default async function Page({ @@ -70,46 +177,47 @@ export default async function Page({ {results && ( <> -

- {results.substances.length + results.specialites.length} RÉSULTATS -

+

{results.length} RÉSULTATS

    - {results.substances.map((substance: SubstanceNom) => ( -
  • - - {formatSpecName(substance.NomLib)} - - - Substance - + {results.map((result, index) => ( +
  • + {"NomLib" in result ? ( + <> + + {formatSpecName(result.NomLib)} + + + Substance + + + ) : ( + <> + {formatSpecName(result.groupName)} + + Médicament + +
      + {result.specialites?.map((specialite) => ( +
    • + + {formatSpecName(specialite.SpecDenom01) + .replace( + `${formatSpecName(result.groupName)}, `, + "", + ) + .replace(formatSpecName(result.groupName), "")} + +
    • + ))} +
    + + )}
  • ))} - {Array.from(groupSpecialites(results.specialites).entries()).map( - ([groupName, specialites]: [string, Specialite[]]) => ( -
  • - {formatSpecName(groupName)} - - Médicament - -
      - {specialites?.map((specialite) => ( -
    • - - {formatSpecName(specialite.SpecDenom01).replace( - formatSpecName(groupName), - "", - )} - -
    • - ))} -
    -
  • - ), - )}
)} diff --git a/src/db/index.ts b/src/db/index.ts index 4046801..892d6c0 100644 --- a/src/db/index.ts +++ b/src/db/index.ts @@ -4,9 +4,16 @@ import { Kysely, NoResultError, PostgresDialect, Selectable } from "kysely"; import { Pool } from "pg"; interface Database { + search_index: SearchIndexTable; leaflet_images: LeafletImagesTable; } +interface SearchIndexTable { + token: string; + table_name: "Specialite" | "Subs_Nom"; + id: string; +} + interface LeafletImagesTable { path: string; image: Buffer; @@ -34,6 +41,7 @@ export const getLeafletImage = async ({ src }: { src: string }) => { }; export type LeafletImage = Selectable; +export type SearchResult = Selectable; const db = new Kysely({ dialect: new PostgresDialect({ diff --git a/src/db/migrations/20240904085535_searchIndex.ts b/src/db/migrations/20240904085535_searchIndex.ts new file mode 100644 index 0000000..871f40c --- /dev/null +++ b/src/db/migrations/20240904085535_searchIndex.ts @@ -0,0 +1,26 @@ +import { Kysely, sql } from "kysely"; + +export async function up(db: Kysely): Promise { + await sql`CREATE EXTENSION IF NOT EXISTS pg_trgm`.execute(db); + await sql`CREATE EXTENSION IF NOT EXISTS unaccent`.execute(db); + + await db.schema + .createTable("search_index") + .addColumn("token", "text") + .addColumn("table_name", "text") + .addColumn("id", "text") + .execute(); + + await db.schema + .createIndex("search_index_trgm") + .on("search_index") + .using("GIN (token gin_trgm_ops)") + .execute(); +} + +export async function down(db: Kysely): Promise { + await db.schema.dropTable("search_index").execute(); + + await sql`DROP EXTENSION IF EXISTS unaccent`.execute(db); + await sql`DROP EXTENSION IF EXISTS pg_trgm`.execute(db); +} diff --git a/src/db/seeds/1725439927263_searchIndex.ts b/src/db/seeds/1725439927263_searchIndex.ts new file mode 100644 index 0000000..eea2956 --- /dev/null +++ b/src/db/seeds/1725439927263_searchIndex.ts @@ -0,0 +1,56 @@ +import type { Kysely } from "kysely"; +import { pdbmMySQL } from 
"@/db/pdbmMySQL"; + +export async function seed(db: Kysely): Promise { + // Get substances from PMDB + const substances = await pdbmMySQL + .selectFrom("Subs_Nom") + .select(["NomLib", "NomId"]) + .execute(); + + await db.transaction().execute(async (db) => { + await db + .deleteFrom("search_index") + .where("table_name", "=", "Subs_Nom") + .execute(); + + // Insert substances into search_index + for (const substance of substances) { + await db + .insertInto("search_index") + .values(({ fn, val }) => ({ + token: fn("unaccent", [val(substance.NomLib)]), + table_name: "Subs_Nom", + id: substance.NomId, + })) + .execute(); + } + }); + + // Get specialities from PMDB + const specialities = await pdbmMySQL + .selectFrom("Specialite") + .select(["SpecDenom01", "SpecId"]) + .execute(); + + await db.transaction().execute(async (db) => { + await db + .deleteFrom("search_index") + .where("table_name", "=", "Specialite") + .execute(); + + // Insert specialities into search_index + for (const specialite of specialities) { + await db + .insertInto("search_index") + .values(({ fn, val }) => ({ + token: fn("unaccent", [val(specialite.SpecDenom01)]), + table_name: "Specialite", + id: specialite.SpecId, + })) + .execute(); + } + }); + + await pdbmMySQL.destroy(); +} diff --git a/src/displayUtils.ts b/src/displayUtils.ts index 703ff99..80667b6 100644 --- a/src/displayUtils.ts +++ b/src/displayUtils.ts @@ -18,14 +18,14 @@ export const formatSpecName = (name: string): string => .join(" "); export function getSpecialiteGroupName(specialite: Specialite): string { - const regexMatch = specialite.SpecDenom01.match(/^[^0-9]+/); + const regexMatch = specialite.SpecDenom01.match(/^[^0-9,]+/); return regexMatch ? 
regexMatch[0] : specialite.SpecDenom01; } -export function groupSpecialites( - specialites: Specialite[], -): Map { - const groups = new Map(); +export function groupSpecialites( + specialites: T[], +): Map { + const groups = new Map(); for (const specialite of specialites) { const groupName = getSpecialiteGroupName(specialite); if (groups.has(groupName)) {