diff --git a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/splitIntoTokensCustomSpec.js b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/splitIntoTokensCustomSpec.js
new file mode 100644
index 00000000000..dd6f0f7cef7
--- /dev/null
+++ b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/splitIntoTokensCustomSpec.js
@@ -0,0 +1,57 @@
+import splitIntoTokensCustom from "../../../../../src/languageProcessing/languages/ja/helpers/splitIntoTokensCustom";
+
+const testcases = [
+	{
+		description: "should return an empty result if the sentence is empty",
+		sentence: { text: "", sourceCodeRange: { startOffset: 0, endOffset: 0 } },
+		expected: [],
+	},
+	{
+		description: "should correctly tokenize a simple Japanese sentence without punctuation",
+		sentence: {
+			text: "犬が大好き",
+			sourceCodeRange: { startOffset: 0, endOffset: 5 },
+		},
+		expected: [ "犬", "が", "大好き" ],
+	},
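+	// Note: "\u3002" in the test inputs below is the ideographic full stop "。" that ends Japanese sentences.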
+	{
+		description: "should correctly tokenize a Japanese sentence with Japanese punctuation",
+		sentence: {
+			text: "犬が大好き\u3002",
+			sourceCodeRange: { startOffset: 0, endOffset: 6 },
+		},
+		expected: [ "犬", "が", "大好き", "。" ],
+	},
+	{
+		description: "should correctly tokenize a Japanese sentence with English punctuation",
+		sentence: {
+			text: "犬が大好き.",
+			sourceCodeRange: { startOffset: 0, endOffset: 6 },
+		},
+		expected: [ "犬", "が", "大好き", "." ],
+	},
+	{
+		description: "should correctly tokenize a Japanese sentence with quotation marks inside",
+		sentence: {
+			text: "犬「が」大好き\u3002",
+			sourceCodeRange: { startOffset: 0, endOffset: 8 },
+		},
+		expected: [ "犬", "「", "が", "」", "大好き", "。" ],
+	},
+	{
+		description: "should correctly tokenize a Japanese sentence with quotation marks around",
+		sentence: {
+			text: "『犬が大好き\u3002』",
+			sourceCodeRange: { startOffset: 0, endOffset: 8 },
+		},
+		expected: [ "『", "犬", "が", "大好き", "。", "』" ],
+	},
+];
+
+describe.each( testcases )( "splitIntoTokensCustom for Japanese: %p", ( { description, sentence, expected } ) => {
+	it( description, () => {
+		const tokens = splitIntoTokensCustom( sentence );
+		expect( tokens ).toEqual( expected );
+	} );
+} );
diff --git a/packages/yoastseo/spec/parse/build/buildSpec.js b/packages/yoastseo/spec/parse/build/buildSpec.js
index f748f8ef880..4669044bbe6 100644
--- a/packages/yoastseo/spec/parse/build/buildSpec.js
+++ b/packages/yoastseo/spec/parse/build/buildSpec.js
@@ -2,6 +2,7 @@ import build from "../../../src/parse/build/build";
 import LanguageProcessor from "../../../src/parse/language/LanguageProcessor";
 import Factory from "../../specHelpers/factory";
 import memoizedSentenceTokenizer from "../../../src/languageProcessing/helpers/sentence/memoizedSentenceTokenizer";
+import splitIntoTokensCustom from "../../../src/languageProcessing/languages/ja/helpers/splitIntoTokensCustom";
 
 describe( "The parse function", () => {
 	it( "parses a basic HTML text", () => {
@@ -67,6 +68,66 @@ describe( "The parse function", () => {
 		} );
 	} );
 
+	it( "parses a basic Japanese HTML text", () => {
+		const html = "<div><p class=\"yoast\">犬が大好き</p></div>";
+
+		const researcher = Factory.buildMockResearcher( {}, true, false, false,
+			{ splitIntoTokensCustom: splitIntoTokensCustom, memoizedTokenizer: memoizedSentenceTokenizer } );
+		const languageProcessor = new LanguageProcessor( researcher );
+		expect( build( html, languageProcessor ) ).toEqual( {
+			name: "#document-fragment",
+			attributes: {},
+			childNodes: [ {
+				name: "div",
+				sourceCodeLocation: {
+					startOffset: 0,
+					endOffset: 37,
+					startTag: {
+						startOffset: 0,
+						endOffset: 5,
+					},
+					endTag: {
+						startOffset: 31,
+						endOffset: 37,
+					},
+				},
+				attributes: {},
+				childNodes: [ {
+					name: "p",
+					isImplicit: false,
+					attributes: {
+						"class": new Set( [ "yoast" ] ),
+					},
+					sentences: [ {
+						text: "犬が大好き",
+						sourceCodeRange: { startOffset: 22, endOffset: 27 },
+						tokens: [
+							{ text: "犬", sourceCodeRange: { startOffset: 22, endOffset: 23 } },
+							{ text: "が", sourceCodeRange: { startOffset: 23, endOffset: 24 } },
+							{ text: "大好き", sourceCodeRange: { startOffset: 24, endOffset: 27 } },
+						],
+					} ],
+					childNodes: [ {
+						name: "#text",
+						value: "犬が大好き",
+					} ],
+					sourceCodeLocation: {
+						startOffset: 5,
+						endOffset: 31,
+						startTag: {
+							startOffset: 5,
+							endOffset: 22,
+						},
+						endTag: {
+							startOffset: 27,
+							endOffset: 31,
+						},
+					},
+				} ],
+			} ],
+		} );
+	} );
+
 	it( "adds implicit paragraphs around phrasing content outside of paragraphs and headings", () => {
 		const html = "<div>Hello World!</div>";
diff --git a/packages/yoastseo/spec/parse/build/private/tokenizeSpec.js b/packages/yoastseo/spec/parse/build/private/tokenizeSpec.js
index 12651e4c272..a08cab6160c 100644
--- a/packages/yoastseo/spec/parse/build/private/tokenizeSpec.js
+++ b/packages/yoastseo/spec/parse/build/private/tokenizeSpec.js
@@ -1,6 +1,7 @@
 import tokenize from "../../../../src/parse/build/private/tokenize";
 import Paper from "../../../../src/values/Paper";
 import EnglishResearcher from "../../../../src/languageProcessing/languages/en/Researcher";
+import JapaneseResearcher from "../../../../src/languageProcessing/languages/ja/Researcher";
 import { buildTreeNoTokenize } from "../../../specHelpers/parse/buildTree";
 import LanguageProcessor from "../../../../src/parse/language/LanguageProcessor";
 
@@ -192,3 +193,81 @@ describe( "A test for the tokenize function", function() {
 		} );
 	} );
 } );
+
+describe( "A test for tokenizing a Japanese sentence", function() {
+	it( "should correctly tokenize a simple Japanese sentence.", function() {
+		const mockPaper = new Paper( "<p>犬が大好き\u3002</p>", { locale: "ja_JP" } );
+		const mockResearcher = new JapaneseResearcher( mockPaper );
+		const languageProcessor = new LanguageProcessor( mockResearcher );
+		buildTreeNoTokenize( mockPaper );
+		// eslint-disable-next-line max-len
+		expect( tokenize( mockPaper.getTree(), languageProcessor ) ).toEqual( {
+			attributes: {},
+			childNodes: [
+				{
+					attributes: {},
+					childNodes: [
+						{
+							name: "#text",
+							value: "犬が大好き。",
+						},
+					],
+					isImplicit: false,
+					name: "p",
+					sentences: [
+						{
+							sourceCodeRange: {
+								startOffset: 3,
+								endOffset: 9,
+							},
+							text: "犬が大好き。",
+							tokens: [
+								{
+									sourceCodeRange: {
+										startOffset: 3,
+										endOffset: 4,
+									},
+									text: "犬",
+								},
+								{
+									sourceCodeRange: {
+										startOffset: 4,
+										endOffset: 5,
+									},
+									text: "が",
+								},
+								{
+									sourceCodeRange: {
+										startOffset: 5,
+										endOffset: 8,
+									},
+									text: "大好き",
+								},
+								{
+									sourceCodeRange: {
+										startOffset: 8,
+										endOffset: 9,
+									},
+									text: "。",
+								},
+							],
+						},
+					],
+					sourceCodeLocation: {
+						startOffset: 0,
+						endOffset: 13,
+						startTag: {
+							startOffset: 0,
+							endOffset: 3,
+						},
+						endTag: {
+							startOffset: 9,
+							endOffset: 13,
+						},
+					},
+				},
+			],
+			name: "#document-fragment",
+		} );
+	} );
+} );
diff --git a/packages/yoastseo/spec/parse/language/LanguageProcessorSpec.js b/packages/yoastseo/spec/parse/language/LanguageProcessorSpec.js
index 460dbddec53..fadd25f9ddf 100644
--- a/packages/yoastseo/spec/parse/language/LanguageProcessorSpec.js
+++ b/packages/yoastseo/spec/parse/language/LanguageProcessorSpec.js
@@ -2,6 +2,7 @@ import LanguageProcessor from "../../../src/parse/language/LanguageProcessor";
 import Factory from "../../specHelpers/factory";
 import memoizedSentenceTokenizer from "../../../src/languageProcessing/helpers/sentence/memoizedSentenceTokenizer";
 import Sentence from "../../../src/parse/structure/Sentence";
+import splitIntoTokensCustom from "../../../src/languageProcessing/languages/ja/helpers/splitIntoTokensCustom";
 
 const researcher = Factory.buildMockResearcher( {}, true, false, false, { memoizedTokenizer: memoizedSentenceTokenizer } );
 
@@ -432,3 +433,23 @@ describe.each( splitIntoTokensTestCases )( "A test for the tokenize method", ( {
 		expect( tokens ).toEqual( expectedTokens );
 	} );
 } );
+
+describe( "A test for the splitIntoTokens method in Japanese", () => {
+	it( "should return an array of tokens", function() {
+		const japaneseResearcher = Factory.buildMockResearcher( {}, true, false, false,
+			{ splitIntoTokensCustom: splitIntoTokensCustom } );
+		const languageProcessor = new LanguageProcessor( japaneseResearcher );
+		const tokens = languageProcessor.splitIntoTokens( new Sentence( "ウクライナは、東ヨーロッパに位置する国家。" ) );
"./helpers/getContentWords"; import memoizedTokenizer from "./helpers/memoizedSentenceTokenizer"; +import splitIntoTokensCustom from "./helpers/splitIntoTokensCustom"; // All config import firstWordExceptions from "./config/firstWordExceptions"; @@ -72,6 +73,7 @@ export default class Researcher extends AbstractResearcher { customCountLength, matchTransitionWordsHelper, memoizedTokenizer, + splitIntoTokensCustom, } ); Object.assign( this.defaultResearches, { diff --git a/packages/yoastseo/src/languageProcessing/languages/ja/helpers/splitIntoTokensCustom.js b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/splitIntoTokensCustom.js new file mode 100644 index 00000000000..959bd1fd99c --- /dev/null +++ b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/splitIntoTokensCustom.js @@ -0,0 +1,22 @@ +import { map } from "lodash-es"; +import TinySegmenter from "tiny-segmenter"; + +/** + * Split sentence into tokens. + * + * @param {Sentence} sentence The sentence to split. + * + * @returns {Token[]} The tokens. + */ +function splitIntoTokensCustom( sentence ) { + // Retrieve sentence from sentence class + const sentenceText = sentence.text; + // Return empty string if sentence is empty + if ( sentenceText === "" ) { + return []; + } + // Split sentences into words that are also tokens + const words = new TinySegmenter().segment( sentenceText ); + return map( words ); +} +export default splitIntoTokensCustom; diff --git a/packages/yoastseo/src/parse/build/private/tokenize.js b/packages/yoastseo/src/parse/build/private/tokenize.js index 6f8933fd0ba..ede3adef6f8 100644 --- a/packages/yoastseo/src/parse/build/private/tokenize.js +++ b/packages/yoastseo/src/parse/build/private/tokenize.js @@ -6,12 +6,12 @@ import getTextElementPositions from "./getTextElementPositions"; * * @param {Paragraph|Heading} node The paragraph or heading node to split into sentences. * @param {Sentence} sentence The sentence. - * @param {function} splitIntoTokens The function to use to split the sentence into tokens. + * @param {function} LanguageProcessor The languageprocessor for the current language. * * @returns {Sentence} The sentence, with tokens. */ -function getTokens( node, sentence, splitIntoTokens ) { - sentence.tokens = splitIntoTokens( sentence ); +function getTokens( node, sentence, LanguageProcessor ) { + sentence.tokens = LanguageProcessor.splitIntoTokens( sentence ); sentence.tokens = getTextElementPositions( node, sentence.tokens, sentence.sourceCodeRange.startOffset ); return sentence; } @@ -31,7 +31,7 @@ function getSentences( node, languageProcessor ) { // Add position information to the sentences. sentences = getTextElementPositions( node, sentences ); // Tokenize sentences into tokens. - return sentences.map( sentence => getTokens( node, sentence, languageProcessor.splitIntoTokens ) ); + return sentences.map( sentence => getTokens( node, sentence, languageProcessor ) ); } /** diff --git a/packages/yoastseo/src/parse/language/LanguageProcessor.js b/packages/yoastseo/src/parse/language/LanguageProcessor.js index 68644efefd7..485c87bae91 100644 --- a/packages/yoastseo/src/parse/language/LanguageProcessor.js +++ b/packages/yoastseo/src/parse/language/LanguageProcessor.js @@ -72,6 +72,13 @@ class LanguageProcessor { // Retrieve sentence from sentence class const sentenceText = sentence.text; + // If there is a custom getWords helper use its output for retrieving words/tokens. 
+ */
+function splitIntoTokensCustom( sentence ) {
+	// Retrieve the sentence text from the Sentence object.
+	const sentenceText = sentence.text;
+	// Return an empty array if the sentence is empty.
+	if ( sentenceText === "" ) {
+		return [];
+	}
+	// Split the sentence into words with TinySegmenter; each segment becomes the text of one token.
+	const words = new TinySegmenter().segment( sentenceText );
+	return map( words );
+}
+export default splitIntoTokensCustom;
diff --git a/packages/yoastseo/src/parse/build/private/tokenize.js b/packages/yoastseo/src/parse/build/private/tokenize.js
index 6f8933fd0ba..ede3adef6f8 100644
--- a/packages/yoastseo/src/parse/build/private/tokenize.js
+++ b/packages/yoastseo/src/parse/build/private/tokenize.js
@@ -6,12 +6,12 @@ import getTextElementPositions from "./getTextElementPositions";
  *
  * @param {Paragraph|Heading} node The paragraph or heading node to split into sentences.
  * @param {Sentence} sentence The sentence.
- * @param {function} splitIntoTokens The function to use to split the sentence into tokens.
+ * @param {LanguageProcessor} languageProcessor The language processor for the current language.
  *
  * @returns {Sentence} The sentence, with tokens.
  */
-function getTokens( node, sentence, splitIntoTokens ) {
-	sentence.tokens = splitIntoTokens( sentence );
+function getTokens( node, sentence, languageProcessor ) {
+	sentence.tokens = languageProcessor.splitIntoTokens( sentence );
 	sentence.tokens = getTextElementPositions( node, sentence.tokens, sentence.sourceCodeRange.startOffset );
 	return sentence;
 }
@@ -31,7 +31,7 @@ function getSentences( node, languageProcessor ) {
 	// Add position information to the sentences.
 	sentences = getTextElementPositions( node, sentences );
 	// Tokenize sentences into tokens.
-	return sentences.map( sentence => getTokens( node, sentence, languageProcessor.splitIntoTokens ) );
+	return sentences.map( sentence => getTokens( node, sentence, languageProcessor ) );
 }
diff --git a/packages/yoastseo/src/parse/language/LanguageProcessor.js b/packages/yoastseo/src/parse/language/LanguageProcessor.js
index 68644efefd7..485c87bae91 100644
--- a/packages/yoastseo/src/parse/language/LanguageProcessor.js
+++ b/packages/yoastseo/src/parse/language/LanguageProcessor.js
@@ -72,6 +72,13 @@ class LanguageProcessor {
 		// Retrieve sentence from sentence class
 		const sentenceText = sentence.text;
 
+		// If the researcher provides a custom splitIntoTokensCustom helper, use it to split the sentence into tokens.
+		const splitIntoTokensCustom = this.researcher.getHelper( "splitIntoTokensCustom" );
+		if ( splitIntoTokensCustom ) {
+			const tokenTexts = splitIntoTokensCustom( sentence );
+			return tokenTexts.map( tokenText => new Token( tokenText ) );
+		}
+
 		// Split the sentence string into tokens. Those tokens are unrefined as they may contain punctuation.
 		const rawTokens = sentenceText.split( wordSeparatorsRegex ).filter( x => x !== "" );