Skip to content

Commit

Permalink
feat: add new experimental parser
Browse files — browse the repository at this point in the history
Closes: #1178
  • Loading branch information
aalemayhu committed Jul 14, 2023
1 parent 10518f0 commit c12d7a2
Show file tree
Hide file tree
Showing 9 changed files with 307 additions and 74 deletions.
7 changes: 3 additions & 4 deletions src/lib/getPackagesFromZip.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@ import { Body } from 'aws-sdk/clients/s3';
import { ZipHandler } from './anki/zip';
import { PrepareDeck } from './parser/DeckParser';
import Package from './parser/Package';
import { isHTMLFile, hasMarkdownFileName } from './storage/checks';
import Settings from './parser/Settings';
import { isHTMLFile } from './storage/checks';

export interface PackageResult {
packages: Package[];
containsMarkdown: boolean;
}

export const getPackagesFromZip = async (
Expand All @@ -19,7 +18,7 @@ export const getPackagesFromZip = async (
const packages = [];

if (!fileContents) {
return { packages: [], containsMarkdown: false };
return { packages: [] };
}

zipHandler.build(fileContents as Uint8Array, isPatreon);
Expand All @@ -36,5 +35,5 @@ export const getPackagesFromZip = async (
}
}

return { packages, containsMarkdown: hasMarkdownFileName(fileNames) };
return { packages };
};
16 changes: 15 additions & 1 deletion src/lib/parser/DeckParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import getYouTubeEmbedLink from './helpers/getYouTubeEmbedLink';
import getUniqueFileName from '../misc/getUniqueFileName';
import { isValidAudioFile } from '../anki/format';
import { sendError } from '../error/sendError';
import FallbackParser from './experimental/FallbackParser';

export class DeckParser {
globalTags: cheerio.Cheerio | null;
Expand Down Expand Up @@ -536,6 +537,17 @@ export class DeckParser {
exporter.configure(this.payload);
return exporter.save();
}

/**
 * Builds an export from the experimental fallback parser instead of the
 * regular payload.
 *
 * A fresh workspace is created and the exporter is set up against it, then
 * the decks produced by `FallbackParser` for `this.files` are handed to the
 * exporter and saved.
 *
 * Note: the first deck's settings are overwritten unconditionally, so this
 * throws if the fallback parser yields no decks (it skips files that have
 * no contents).
 *
 * @returns whatever `exporter.save()` returns.
 */
tryExperimental() {
  const workspace = new Workspace(true, 'fs');
  const exporter = this.setupExporter(this.payload, workspace.location);

  const fallbackDecks = new FallbackParser(this.files).run(this.settings);
  fallbackDecks[0].settings = this.settings;

  exporter.configure(fallbackDecks);
  return exporter.save();
}
}

export async function PrepareDeck(
Expand All @@ -545,8 +557,10 @@ export async function PrepareDeck(
) {
const parser = new DeckParser(fileName, settings, files);
const total = parser.payload.map((p) => p.cardCount).reduce((a, b) => a + b);

if (total === 0) {
return null;
const apkg = await parser.tryExperimental();
return { name: `${parser.name}.apkg`, apkg };
}

const apkg = await parser.build();
Expand Down
89 changes: 89 additions & 0 deletions src/lib/parser/experimental/FallbackParser.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import cheerio from 'cheerio';

import { File } from '../../anki/zip';
import Deck from '../Deck';
import Note from '../Note';
import Settings from '../Settings';
import { PlainTextParser } from './PlainTextParser/PlainTextParser';
import { isBasicFlashcard, isClozeFlashcard } from './PlainTextParser/types';

/**
 * Experimental parser that builds decks from the list items of HTML files.
 *
 * Each file's `<ul>`/`<ol>` content is flattened to plain text and passed to
 * `PlainTextParser`; every flashcard it finds becomes a `Note` in a `Deck`
 * named after the document's `<title>`.
 */
class FallbackParser {
  constructor(private readonly files: File[]) {}

  /**
   * Extracts the text of every list in an HTML document.
   *
   * @param html - HTML source to scan.
   * @returns One string per `<ul>`/`<ol>` element, in document order. Each
   *   `<li>` found inside the list contributes a line of the form
   *   `• <trimmed text>\n`.
   *
   * Note: `find('li')` matches descendants at any depth and nested lists are
   * also selected on their own, so items of a nested list show up more than
   * once in the result.
   */
  htmlToTextWithNewlines(html: string) {
    const $ = cheerio.load(html);

    // Renders the given <li> elements as one bullet line each.
    function processListItems(items: cheerio.Cheerio) {
      let result = '';
      items.each((_i, element) => {
        const itemText = $(element).text().trim();
        result += `• ${itemText}\n`;
      });
      return result;
    }

    const lists: string[] = [];
    $('ul, ol').each((_i, element) => {
      lists.push(processListItems($(element).find('li')));
    });

    return lists;
  }

  /**
   * Reads the document title.
   *
   * @param html - HTML source to scan.
   * @returns The trimmed text of the `<title>` element(s), or an empty string
   *   when there is none.
   */
  getTitleFromHTML(html: string) {
    const $ = cheerio.load(html);
    return $('title').text().trim();
  }

  /**
   * Reads the CSS embedded in the document.
   *
   * @param html - HTML source to scan.
   * @returns The text content of the `<style>` element(s), or an empty string
   *   when the document has none.
   */
  getStyleTagFromString(html: string) {
    const $ = cheerio.load(html);
    const styleTag = $('style');

    if (styleTag.length === 0) {
      return ''; // No style tag found
    }

    return styleTag.text();
  }

  /**
   * Converts every file that has contents into a deck.
   *
   * @param settings - Settings passed through to each created `Deck`.
   * @returns One deck per file with non-empty contents; files without
   *   contents are skipped, so the result may be empty.
   */
  run(settings: Settings) {
    const decks: Deck[] = [];

    for (const file of this.files) {
      const contents = file.contents?.toString();
      if (!contents) {
        continue;
      }

      const plainText = this.htmlToTextWithNewlines(contents);
      const plainTextParser = new PlainTextParser();
      const found = plainTextParser.parse(plainText.join('\n'));

      const cards: Note[] = found.filter(Boolean).map((card, index) => {
        const note = new Note(card.front, '');
        note.number = index;
        if (isClozeFlashcard(card)) {
          note.cloze = true;
        } else {
          // A line without a " - " separator has no back at runtime; keep the
          // empty-string default instead of overwriting it with undefined.
          note.back = card.back ?? '';
        }
        return note;
      });

      decks.push(
        new Deck(
          this.getTitleFromHTML(contents),
          cards,
          '', // skip cover image
          '', // skip style
          Deck.GenerateId(),
          settings
        )
      );
    }

    return decks;
  }
}

export default FallbackParser;
63 changes: 63 additions & 0 deletions src/lib/parser/experimental/PlainTextParser/PlainTextParser.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import {
BasicCard,
ClozeCard,
Flashcard,
isPossiblyClozeFlashcard,
} from './types';

/**
 * Turns loosely formatted plain text into flashcards.
 *
 * The input is a sequence of bullet points, each of the form
 * `prompt - answer`. A prompt containing blanks (`_`) becomes a cloze card
 * whose blanks are filled from a `, `-separated answer list; everything else
 * becomes a basic front/back card.
 */
export class PlainTextParser {
  /** Separator between the prompt and its answer(s). */
  private static readonly SEPARATOR = ' - ';

  /**
   * Boundary between bullet points: a blank line, or a line break followed by
   * a `- ` or `• ` marker (the marker itself is kept and stripped later).
   */
  private static readonly BULLET_BREAK = /\n\n|\n(?=[-•] )/;

  /** A `- ` or `• ` marker at the start of a bullet point. */
  private static readonly LEADING_BULLET = /^\s*[-•] /;

  /** Splits on the first separator only, so the answer may itself contain " - ". */
  private splitOnSeparator(text: string): [string, string | undefined] {
    const separator = PlainTextParser.SEPARATOR;
    const at = text.indexOf(separator);
    if (at === -1) {
      return [text, undefined];
    }
    return [text.slice(0, at), text.slice(at + separator.length)];
  }

  /**
   * Splits an answer string into individual answers.
   *
   * @param answers - Answers separated by `, `.
   * @returns The individual answers; a string without `, ` yields a
   *   single-element list.
   */
  getOneOrMoreAnswers(answers: string): string[] {
    return answers.split(', ');
  }

  /**
   * Builds a cloze card by filling the blanks of a sentence in order.
   *
   * @param sentence - Text containing one or more blanks (runs of `_`).
   * @param answers - `, `-separated answers; the n-th answer fills the n-th
   *   blank as `{{cN::answer}}`.
   * @returns The cloze card. Blanks without a matching answer are left as
   *   they are; surplus answers are ignored.
   */
  fillInTheBlanks(sentence: string, answers: string): ClozeCard {
    const answerList = this.getOneOrMoreAnswers(answers);
    let used = 0;

    // Single pass with a replacer function: answers are inserted verbatim
    // (no `$&`/`$1` substitution) and underscores inside an inserted answer
    // are never mistaken for the next blank.
    const clozeSentence = sentence.replace(/_+/g, (blank) => {
      if (used >= answerList.length) {
        return blank;
      }
      const answer = answerList[used];
      used += 1;
      return `{{c${used}::${answer}}}`;
    });

    return {
      front: clozeSentence,
      isCloze: true,
    };
  }

  /**
   * Builds a basic card from `front - back` text.
   *
   * @param flashcardText - Text to split on the first ` - `.
   * @returns The card; `back` is an empty string when there is no separator.
   */
  getBasicFlashcard(flashcardText: string): BasicCard {
    const [front, back] = this.splitOnSeparator(flashcardText);

    return {
      front,
      back: back ?? '',
    };
  }

  /**
   * Parses plain text into flashcards.
   *
   * @param input - Bullet points separated by blank lines or by line breaks
   *   followed by a `- ` / `• ` marker.
   * @returns The cards in input order. Empty bullet points are skipped. A
   *   prompt with blanks but no answers is returned as a basic card, because
   *   there is nothing to fill the blanks with.
   */
  parse(input: string): Flashcard[] {
    const flashcards: Flashcard[] = [];

    for (const rawBulletPoint of input.split(PlainTextParser.BULLET_BREAK)) {
      const bulletPoint = rawBulletPoint
        .replace(PlainTextParser.LEADING_BULLET, '')
        .trim();
      if (bulletPoint === '') {
        continue;
      }

      const [question, answers] = this.splitOnSeparator(bulletPoint);

      if (answers && isPossiblyClozeFlashcard(question)) {
        flashcards.push(this.fillInTheBlanks(question, answers));
        continue;
      }

      flashcards.push(this.getBasicFlashcard(bulletPoint));
    }

    return flashcards;
  }
}
27 changes: 27 additions & 0 deletions src/lib/parser/experimental/PlainTextParser/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/**
 * A flashcard whose answers are embedded in the front text as cloze
 * deletions.
 */
export interface ClozeCard {
// Marks the card as a cloze card; isClozeFlashcard only accepts `true`.
isCloze: boolean;
// Card text; PlainTextParser.fillInTheBlanks fills it with `{{cN::answer}}` markers.
front: string;
// NOTE(review): not set by any code visible here — presumably extra text shown with the card; confirm.
extra?: string;
}

/**
 * A plain question/answer flashcard.
 */
export interface BasicCard {
// Text before the " - " separator (see PlainTextParser.getBasicFlashcard).
front: string;
// Text after the " - " separator.
back: string;
// NOTE(review): not set by any code visible here — format of the tag string is unknown; confirm.
tags?: string;
}

/** Any card the plain-text parser can produce: cloze or basic. */
export type Flashcard = ClozeCard | BasicCard;

/**
 * Type guard for cloze cards.
 *
 * @param flashcard - Card to inspect.
 * @returns `true` only when the card has an `isCloze` property that is
 *   exactly `true`.
 */
export const isClozeFlashcard = (
  flashcard: Flashcard
): flashcard is ClozeCard => {
  if (!('isCloze' in flashcard)) {
    return false;
  }
  return flashcard.isCloze === true;
};

/**
 * Type guard for basic cards.
 *
 * @param flashcard - Card to inspect.
 * @returns `true` only when the card has a `back` property whose value is
 *   not `undefined`.
 */
export const isBasicFlashcard = (
  flashcard: Flashcard
): flashcard is BasicCard => {
  if (!('back' in flashcard)) {
    return false;
  }
  return flashcard.back !== undefined;
};

/**
 * Cheap check for whether a question might be a cloze prompt.
 *
 * The previous `&& question.split('-')` operand always produced a (truthy)
 * array, so it never influenced the outcome and only turned the result into
 * `false | string[]`. The check is now a plain boolean with the same
 * truthiness.
 *
 * @param question - Prompt text (the part before the answer separator).
 * @returns `true` when the prompt contains at least one `_` blank.
 */
export const isPossiblyClozeFlashcard = (question: string): boolean =>
  question.includes('_');
85 changes: 16 additions & 69 deletions src/services/UploadService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,20 @@ import express from 'express';
import multer from 'multer';
import multerS3 from 'multer-s3';

import UploadRepository from '../data_layer/UploadRespository';
import { BytesToMegaBytes } from '../lib/misc/file';
import { getUploadLimits } from '../lib/misc/getUploadLimits';
import StorageHandler from '../lib/storage/StorageHandler';
import { UploadedFile } from '../lib/storage/types';
import { sendBundle } from '../controllers/UploadController';
import { getOwner } from '../lib/User/getOwner';
import UploadRepository from '../data_layer/UploadRespository';
import { sendError } from '../lib/error/sendError';
import { getPackagesFromZip } from '../lib/getPackagesFromZip';
import ErrorHandler, {
UNSUPPORTED_FORMAT_MD,
NO_PACKAGE_ERROR,
UNSUPPORTED_FORMAT_MD,
} from '../lib/misc/ErrorHandler';
import { PrepareDeck } from '../lib/parser/DeckParser';
import Package from '../lib/parser/Package';
import { BytesToMegaBytes } from '../lib/misc/file';
import { getUploadLimits } from '../lib/misc/getUploadLimits';
import Settings from '../lib/parser/Settings';
import {
hasMarkdownFileName,
isHTMLFile,
isZIPFile,
} from '../lib/storage/checks';
import StorageHandler from '../lib/storage/StorageHandler';
import GeneratePackagesUseCase from '../usecases/uploads/GeneratePackagesUseCase';
import { toText } from './NotionService/BlockHandler/helpers/deckNameToText';
import { UploadedFile } from '../lib/storage/types';

class UploadService {
getUploadsByOwner(owner: number) {
Expand All @@ -38,21 +30,6 @@ class UploadService {
await s.delete(key);
}

registerUploadSize(file: UploadedFile, owner?: number) {
const { originalname, key, size } = file;

if (!owner) {
return;
}

return this.uploadRepository.update(
owner,
originalname,
key,
BytesToMegaBytes(size)
);
}

getUploadHandler(res: express.Response, storage: StorageHandler) {
return multer({
limits: getUploadLimits(res.locals.patreon),
Expand Down Expand Up @@ -80,42 +57,16 @@ class UploadService {
res: express.Response
) {
try {
const files = req.files as UploadedFile[];
let packages: Package[] = [];
let hasMarkdown: boolean = hasMarkdownFileName(
files.map((file) => file.originalname)
);
for (const file of files) {
const filename = file.originalname;
const settings = new Settings(req.body || {});

await this.registerUploadSize(file, getOwner(res));
const key = file.key;
const fileContents = await storage.getFileContents(key);

if (isHTMLFile(filename)) {
const d = await PrepareDeck(
filename,
[{ name: filename, contents: fileContents.Body }],
settings
);
if (d) {
const pkg = new Package(d.name, d.apkg);
packages = packages.concat(pkg);
}
} else if (isZIPFile(filename) || isZIPFile(key)) {
const { packages: extraPackages, containsMarkdown } =
await getPackagesFromZip(
fileContents.Body,
res.locals.patreon,
settings
);
packages = packages.concat(extraPackages);
hasMarkdown = containsMarkdown;
}
}
let payload;
let plen;
const settings = new Settings(req.body || {});

const useCase = new GeneratePackagesUseCase(storage);
const packages = await useCase.execute(
res.locals.patreon,
req.files as UploadedFile[],
settings
);

const first = packages[0];
if (packages.length === 1) {
Expand All @@ -141,11 +92,7 @@ class UploadService {
await sendBundle(packages, res);
console.info('Sent bundle with %d packages', packages.length);
} else {
if (hasMarkdown) {
ErrorHandler(res, UNSUPPORTED_FORMAT_MD);
} else {
ErrorHandler(res, NO_PACKAGE_ERROR);
}
ErrorHandler(res, NO_PACKAGE_ERROR);
}
} catch (err) {
sendError(err);
Expand Down
Loading

0 comments on commit c12d7a2

Please sign in to comment.