-
-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: adding fallbacks for bullet lists (#1180)
The code is ugly as hell but this adds a new fallback parser. For power users who already have an established workflow, this will not introduce any issues but only be triggered when their upload would not create any flashcards. It relies heavily on the format suggested here #1178: The fallback parser takes a bullet list as input and generates one or more flashcards for each bullet point. The bullet points are separated by a blank line (\n\n). In the bullet point, the question and answer are separated by a dash surrounded by spaces ( - ). If there are more answers, e.g. in the cloze deletion case, then answers are comma delimited after the dash (answer1, answer2, etc.) ## Examples ### Cloze card ``` The capital of France is _____. - Paris ``` That generates the following flashcard: ``` Front of the card: The capital of France is {{c1::Paris}}. ``` ### Basic card Input: ``` What is the capital of Kosovo? - Pristina ``` Output: ``` Front of the card: What is the capital of Kosovo? Back of the card: Pristina ``` ## Supported formats Note this does not work in the Notion integration but is a fallback mechanism added for people uploading files that are not handled by the current DeckParser. So I am hoping people see the error message about no cards less often with this change. This is a fallback so things like styling and images do not work. Markdown, HTML, and ZIP support work for the fallback parser. But note that there might be some unexpected issues like the styling is not applied and there are cases where the front gets duplicated content. Enjoy!
- Loading branch information
Showing
14 changed files
with
434 additions
and
134 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,40 +0,0 @@ | ||
import { Body } from 'aws-sdk/clients/s3'; | ||
import { ZipHandler } from './anki/zip'; | ||
import { PrepareDeck } from './parser/DeckParser'; | ||
import Package from './parser/Package'; | ||
import { isHTMLFile, hasMarkdownFileName } from './storage/checks'; | ||
import Settings from './parser/Settings'; | ||
|
||
/**
 * Outcome of converting an uploaded ZIP archive into Anki packages.
 */
export interface PackageResult {
  /** True when `hasMarkdownFileName` reports a markdown file name in the archive. */
  containsMarkdown: boolean;
  /** One package per HTML file that could be turned into a deck. */
  packages: Package[];
}
|
||
/**
 * Build one Anki package for every HTML file inside an uploaded ZIP archive.
 *
 * @param fileContents raw archive bytes; when missing, an empty result is returned
 * @param isPatreon forwarded to `ZipHandler.build`
 * @param settings deck settings forwarded to `PrepareDeck`
 * @returns the generated packages plus a flag telling whether the archive
 *          contains markdown file names
 */
export const getPackagesFromZip = async (
  fileContents: Body | undefined,
  isPatreon: boolean,
  settings: Settings
): Promise<PackageResult> => {
  const zipHandler = new ZipHandler();

  if (!fileContents) {
    return { packages: [], containsMarkdown: false };
  }

  zipHandler.build(fileContents as Uint8Array, isPatreon);
  const fileNames = zipHandler.getFileNames();

  const packages: Package[] = [];
  for (const fileName of fileNames) {
    // Only HTML files are handed to the deck parser.
    if (!isHTMLFile(fileName)) {
      continue;
    }

    const deck = await PrepareDeck(fileName, zipHandler.files, settings);
    if (!deck) {
      continue;
    }

    packages.push(new Package(deck.name, deck.apkg));
  }

  return { packages, containsMarkdown: hasMarkdownFileName(fileNames) };
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
import cheerio from 'cheerio'; | ||
|
||
import { File } from '../../anki/zip'; | ||
import { isHTMLFile, isMarkdownFile, isPlainText } from '../../storage/checks'; | ||
import Deck from '../Deck'; | ||
import Note from '../Note'; | ||
import Settings from '../Settings'; | ||
import { PlainTextParser } from './PlainTextParser/PlainTextParser'; | ||
import { Flashcard, isClozeFlashcard } from './PlainTextParser/types'; | ||
|
||
/**
 * Best-effort parser that turns bullet lists found in HTML, markdown or plain
 * text files into decks. Styling and images are not carried over.
 */
class FallbackParser {
  constructor(private readonly files: File[]) {}

  /**
   * Extract the text of every list in an HTML document.
   *
   * @param html HTML document source
   * @returns one string per `<ul>`/`<ol>` element; each string contains one
   *          `• text\n` line per `<li>` found anywhere below that list
   */
  htmlToTextWithNewlines(html: string): string[] {
    const $ = cheerio.load(html);

    const processListItems = (items: cheerio.Cheerio): string => {
      let result = '';
      items.each((_, element) => {
        const itemText = $(element).text().trim();
        result += `• ${itemText}\n`;
      });
      return result;
    };

    // NOTE(review): nested lists are matched both on their own and through
    // their ancestors' `find('li')`, so their text can appear more than once.
    const items: string[] = [];
    $('ul, ol').each((_, element) => {
      items.push(processListItems($(element).find('li')));
    });

    return items;
  }

  /**
   * Read the trimmed text of the `<title>` element.
   *
   * @param html HTML document source
   * @returns the title, or an empty string when there is none
   */
  getTitleFromHTML(html: string): string {
    const $ = cheerio.load(html);
    return $('title').text().trim();
  }

  /**
   * Read the text content of the `<style>` element(s).
   *
   * @param html HTML document source
   * @returns the CSS text, or an empty string when no style tag exists
   */
  getStyleTagFromString(html: string): string {
    const $ = cheerio.load(html);
    const styleTag = $('style');

    if (styleTag.length === 0) {
      return ''; // No style tag found, return an empty string
    }

    return styleTag.text() ?? '';
  }

  /**
   * Find bullet items (`-`, `*` or `+` followed by a space) in markdown.
   *
   * @param markdown user input markdown
   * @returns the matched items including their bullet marker, or `null`
   *          when nothing matches
   */
  getMarkdownBulletLists(markdown: string): RegExpMatchArray | null {
    const bulletListRegex = /[-*+]( .*)+/g;
    return markdown.match(bulletListRegex);
  }

  /**
   * Return the correct title from markdown
   *
   * Notion can have two titles in Markdown files.
   * The first one is the title with the id of the page.
   * The second one is the title of the page only.
   *
   * @param markdown user input markdown
   * @returns deck title, or 'Default' when the document has no heading
   */
  getTitleMarkdown(markdown: string): string {
    const headingRegex = /^(#{1,6})\s+(.*)$/gm;
    const matches = [...markdown.matchAll(headingRegex)];
    // Prefer the second heading, fall back to the first one.
    const heading = matches[1] ?? matches[0];
    if (!heading) {
      return 'Default';
    }
    return heading[2];
  }

  /**
   * Convert parsed flashcards into notes.
   *
   * Cloze cards only get their front text and the cloze flag. Basic cards get
   * their back; when the back is empty and the front spans several lines, the
   * first line stays on the front and the remaining lines become the back.
   *
   * @param cards flashcards produced by the plain text parser
   * @returns one note per card, numbered by position
   */
  mapCardsToNotes(cards: Flashcard[]): Note[] {
    return cards.filter(Boolean).map((card, index) => {
      const note = new Note(card.front, '');
      note.number = index;

      if (isClozeFlashcard(card)) {
        note.cloze = true;
        return note;
      }

      note.back = card.back;
      if (!note.back || note.back.trim().length === 0) {
        const parts = note.name.split('\n');
        if (parts.length > 1) {
          note.name = parts[0];
          note.back = parts.slice(1).join('\n');
        }
      }
      return note;
    });
  }

  /**
   * Build one deck per supported file handed to the constructor.
   *
   * Files without content, files of an unsupported type and markdown/plain
   * text files without bullet items are skipped, so no empty placeholder
   * decks are produced for them.
   *
   * @param settings deck settings passed on to every created deck
   * @returns the created decks
   */
  run(settings: Settings): Deck[] {
    const defaultDeckName = 'Untitled';
    const decks: Deck[] = [];
    // The parser keeps no state, so a single instance serves every file.
    const plainTextParser = new PlainTextParser();

    for (const file of this.files) {
      const contents = file.contents?.toString();
      if (!contents) {
        continue;
      }

      let cards: Note[];
      let deckName: string;
      if (isHTMLFile(file.name)) {
        const plainText = this.htmlToTextWithNewlines(contents).join('\n');
        cards = this.mapCardsToNotes(plainTextParser.parse(plainText));
        // A document without <title> would otherwise give a nameless deck.
        deckName = this.getTitleFromHTML(contents) || defaultDeckName;
      } else if (isMarkdownFile(file.name) || isPlainText(file.name)) {
        const items = this.getMarkdownBulletLists(contents);
        if (!items) {
          continue;
        }
        cards = this.mapCardsToNotes(plainTextParser.parse(items.join('\n')));
        deckName = this.getTitleMarkdown(contents);
      } else {
        // Unsupported file type (e.g. an image inside a ZIP): nothing to parse.
        continue;
      }

      decks.push(
        new Deck(
          deckName,
          Deck.CleanCards(cards),
          '', // skip cover image
          '', // skip style
          Deck.GenerateId(),
          settings
        )
      );
    }
    return decks;
  }
}
|
||
export default FallbackParser; |
63 changes: 63 additions & 0 deletions
63
src/lib/parser/experimental/PlainTextParser/PlainTextParser.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
import { | ||
BasicCard, | ||
ClozeCard, | ||
Flashcard, | ||
isPossiblyClozeFlashcard, | ||
} from './types'; | ||
|
||
/**
 * Parses bullet points of the form `question - answer` into flashcards.
 *
 * A question containing underscores is treated as a cloze sentence whose
 * blanks are filled with the comma separated answers.
 */
export class PlainTextParser {
  /**
   * Split the answer part of a bullet point into individual answers.
   *
   * @param answers answers separated by `', '`
   * @returns the individual answers; always at least one element
   */
  getOneOrMoreAnswers(answers: string): string[] {
    // `split` never returns an empty array, so no fallback is required.
    return answers.split(', ');
  }

  /**
   * Replace the blanks (runs of underscores) in a sentence with cloze markers.
   *
   * Blanks are filled left to right in a single pass, so underscores inside
   * an inserted answer are never mistaken for a blank. Blanks without a
   * matching answer are left untouched; surplus answers are ignored.
   *
   * @param sentence sentence containing one or more blanks
   * @param answers answers separated by `', '`
   * @returns the cloze card
   */
  fillInTheBlanks(sentence: string, answers: string): ClozeCard {
    const answerList = this.getOneOrMoreAnswers(answers);
    let used = 0;

    // A replacer function keeps `$&`, `$$` etc. in answers literal.
    const clozeSentence = sentence.replace(/_+/g, (blank) => {
      const answer = answerList[used];
      if (answer === undefined) {
        return blank;
      }
      used += 1;
      return `{{c${used}::${answer}}}`;
    });

    return {
      front: clozeSentence,
      isCloze: true,
    };
  }

  /**
   * Build a basic card from a `front - back` bullet point.
   *
   * Only the first `' - '` separates front and back, so the back may itself
   * contain `' - '`.
   *
   * @param flashcardText bullet point text
   * @returns the card; `back` is an empty string when there is no separator
   */
  getBasicFlashcard(flashcardText: string): BasicCard {
    const [front, back] = this.splitQuestionAndAnswers(flashcardText);

    return {
      front,
      back: back ?? '',
    };
  }

  /**
   * Parse bullet points into flashcards.
   *
   * Bullet points are separated by a blank line or by a newline followed by
   * `'- '`.
   *
   * @param input text containing the bullet points
   * @returns one flashcard per bullet point
   */
  parse(input: string): Flashcard[] {
    const flashcards: Flashcard[] = [];

    for (const bulletPoint of input.split(/\n\n|\n- /)) {
      const [question, answers] = this.splitQuestionAndAnswers(bulletPoint);

      // Without answers there is nothing to fill the blanks with, so such a
      // bullet point is kept as a basic card.
      if (answers !== undefined && isPossiblyClozeFlashcard(question)) {
        flashcards.push(this.fillInTheBlanks(question, answers));
        continue;
      }

      flashcards.push(this.getBasicFlashcard(bulletPoint));
    }

    return flashcards;
  }

  /** Split a bullet point at its first `' - '`; the second part is `undefined` when absent. */
  private splitQuestionAndAnswers(
    bulletPoint: string
  ): [string, string | undefined] {
    const separator = ' - ';
    const at = bulletPoint.indexOf(separator);
    if (at === -1) {
      return [bulletPoint, undefined];
    }
    return [bulletPoint.slice(0, at), bulletPoint.slice(at + separator.length)];
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
/** Flashcard whose front text carries cloze deletions. */
export interface ClozeCard {
  /** Front text of the card. */
  front: string;
  /** Flag checked by `isClozeFlashcard`; only `true` identifies a cloze card. */
  isCloze: boolean;
  /** Optional additional text for the card. */
  extra?: string;
}
|
||
/** Flashcard with a question side and an answer side. */
export interface BasicCard {
  /** Question side of the card. */
  front: string;
  /** Answer side of the card. */
  back: string;
  /** Optional tags for the card. */
  tags?: string;
}
|
||
/** Any flashcard the plain text parser can produce. */
export type Flashcard = BasicCard | ClozeCard;
|
||
/**
 * Type guard for cloze cards.
 *
 * @param flashcard card to inspect
 * @returns true only when the card has an `isCloze` property set to `true`
 */
export const isClozeFlashcard = (
  flashcard: Flashcard
): flashcard is ClozeCard => {
  if (!('isCloze' in flashcard)) {
    return false;
  }
  return flashcard.isCloze === true;
};
|
||
/**
 * Type guard for basic cards.
 *
 * @param flashcard card to inspect
 * @returns true only when the card has a `back` property that is not `undefined`
 */
export const isBasicFlashcard = (
  flashcard: Flashcard
): flashcard is BasicCard => {
  if (!('back' in flashcard)) {
    return false;
  }
  return flashcard.back !== undefined;
};
|
||
/**
 * Heuristic check for cloze questions: a question is a cloze candidate when
 * it contains at least one underscore (the blank marker).
 *
 * @param question question part of a bullet point
 * @returns true when the question contains an underscore
 */
export const isPossiblyClozeFlashcard = (question: string): boolean => {
  // The former `&& question.split('-')` was always truthy and only turned the
  // result into an array; the underscore check alone decides.
  return question.includes('_');
};
Oops, something went wrong.