Skip to content

Commit e13436d

Browse files
authored
Merge pull request #23258 from Yoast/209-html-parser---investigate-when-to-build-the-tree
refactor: optimize HTML tree building to reuse existing trees
2 parents 0b40458 + b837180 commit e13436d

10 files changed

Lines changed: 573 additions & 28 deletions

File tree

packages/js/src/initializers/analysis.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,9 @@ export function collectData() {
101101
...data,
102102
textTitle: getEditorDataTitle(),
103103
isFrontPage: getIsFrontPage(),
104+
shortcodes: Array.isArray( window.wpseoScriptData.analysis.plugins.shortcodes?.wpseo_shortcode_tags )
105+
? window.wpseoScriptData.analysis.plugins.shortcodes.wpseo_shortcode_tags
106+
: [],
104107
};
105108

106109
const analysisData = applyAnalysisModifications( data );
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/**
2+
* Tests for the shortcode-extraction behaviour of `collectData` in
3+
* src/initializers/analysis.js. `collectData` is the main-thread data source the Insights panel
4+
* uses for its own analysis, so the shortcodes it surfaces must match what the analyze cycle is
5+
* told about — otherwise the Insights and assessment results can drift on shortcode-rich posts.
6+
*/
7+
8+
jest.mock( "@wordpress/data", () => ( {
9+
select: jest.fn(),
10+
dispatch: jest.fn(),
11+
subscribe: jest.fn(),
12+
combineReducers: jest.requireActual( "@wordpress/data" ).combineReducers,
13+
} ) );
14+
15+
jest.mock( "@wordpress/hooks", () => ( {
16+
applyFilters: ( hook, value ) => value,
17+
doAction: jest.fn(),
18+
} ) );
19+
20+
jest.mock( "../../src/initializers/pluggable", () => ( {
21+
applyModifications: ( modification, value ) => value,
22+
} ) );
23+
24+
import { select } from "@wordpress/data";
25+
import { collectData } from "../../src/initializers/analysis";
26+
27+
const originalWpseoScriptData = window.wpseoScriptData;
28+
29+
describe( "collectData — shortcodes extraction", () => {
30+
beforeEach( () => {
31+
select.mockImplementation( () => ( {
32+
getAnalysisData: () => ( { text: "", title: "", description: "" } ),
33+
getEditorDataTitle: () => "",
34+
getIsFrontPage: () => false,
35+
} ) );
36+
} );
37+
38+
afterAll( () => {
39+
window.wpseoScriptData = originalWpseoScriptData;
40+
} );
41+
42+
it( "passes the wpseo_shortcode_tags array through when it is present", () => {
43+
window.wpseoScriptData = {
44+
// eslint-disable-next-line camelcase
45+
analysis: { plugins: { shortcodes: { wpseo_shortcode_tags: [ "gallery", "caption" ] } } },
46+
};
47+
48+
expect( collectData().shortcodes ).toEqual( [ "gallery", "caption" ] );
49+
} );
50+
51+
it( "falls back to an empty array when plugins.shortcodes is missing", () => {
52+
window.wpseoScriptData = { analysis: { plugins: {} } };
53+
54+
expect( collectData().shortcodes ).toEqual( [] );
55+
} );
56+
57+
it( "falls back to an empty array when wpseo_shortcode_tags is missing on the shortcodes object", () => {
58+
window.wpseoScriptData = { analysis: { plugins: { shortcodes: {} } } };
59+
60+
expect( collectData().shortcodes ).toEqual( [] );
61+
} );
62+
63+
it( "falls back to an empty array when wpseo_shortcode_tags is null", () => {
64+
window.wpseoScriptData = {
65+
// eslint-disable-next-line camelcase
66+
analysis: { plugins: { shortcodes: { wpseo_shortcode_tags: null } } },
67+
};
68+
69+
expect( collectData().shortcodes ).toEqual( [] );
70+
} );
71+
72+
it( "falls back to an empty array when wpseo_shortcode_tags is a non-array value", () => {
73+
window.wpseoScriptData = {
74+
// eslint-disable-next-line camelcase
75+
analysis: { plugins: { shortcodes: { wpseo_shortcode_tags: { gallery: true } } } },
76+
};
77+
78+
expect( collectData().shortcodes ).toEqual( [] );
79+
} );
80+
} );

packages/yoastseo/spec/languageProcessing/researches/keyphraseDistributionSpec.js

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import JapaneseResearcher from "../../../src/languageProcessing/languages/ja/Res
1313
import getMorphologyData from "../../specHelpers/getMorphologyData";
1414
import { realWorldULExample1, realWorldULExample2 } from "../helpers/sanitize/mergeListItemsSpec";
1515
import buildTree from "../../specHelpers/parse/buildTree";
16+
import getSentencesFromTree from "../../../src/languageProcessing/helpers/sentence/getSentencesFromTree";
1617
import { primeLanguageSpecificData } from "../../../src/languageProcessing/helpers/morphology/buildTopicStems";
1718

1819
const morphologyData = getMorphologyData( "en" );
@@ -1460,6 +1461,71 @@ describe( "Test for the research", function() {
14601461
expect( keyphraseDistributionResearcher( paperWithList, researcherListCondition ).keyphraseDistractionPercentage ).toEqual(
14611462
keyphraseDistributionResearcher( paperWithWords, researcherWordsCondition ).keyphraseDistractionPercentage );
14621463
} );
1464+
1465+
describe( "leaves no sentenceParentNode back-references on the tree", () => {
1466+
// The research uses getSentencesFromTree( tree, true ) which sets `sentence.sentenceParentNode = parentNode`
1467+
// on every sentence in the tree. The parent paragraph in turn holds the sentence in its `sentences` array,
1468+
// which creates a paragraph -> sentence -> sentenceParentNode -> paragraph cycle. When the tree is reused
1469+
// across calls (post tree-build dedup), that cycle escapes into downstream research/analyze results and
1470+
// blows Transporter.serialize. Assert the research cleans up after itself so the tree stays acyclic.
1471+
it( "strips sentenceParentNode from tree sentences after running on a paragraph-only paper", () => {
1472+
const paper = new Paper(
1473+
"<p>The keyphrase appears here. The keyphrase appears again.</p>" +
1474+
"<p>This is a distinct paragraph without the keyphrase. Another sentence.</p>",
1475+
{ locale: "en_US", keyword: "keyphrase" }
1476+
);
1477+
const researcher = new Researcher( paper );
1478+
buildTree( paper, researcher );
1479+
researcher.addResearchData( "morphology", morphologyData );
1480+
1481+
keyphraseDistributionResearcher( paper, researcher );
1482+
1483+
const sentences = getSentencesFromTree( paper.getTree() );
1484+
expect( sentences.length ).toBeGreaterThan( 0 );
1485+
sentences.forEach( sentence => {
1486+
expect( sentence.sentenceParentNode ).toBeUndefined();
1487+
} );
1488+
} );
1489+
1490+
it( "strips sentenceParentNode from tree sentences after running on a paper with list items", () => {
1491+
// The list-items code path merges sentences and writes sentenceParentNode (as an array) on the merged
1492+
// sentence as well. The tree's original sentences also carry the back-ref and must be cleaned up.
1493+
const paper = new Paper(
1494+
"<p>The keyphrase intro paragraph.</p>" +
1495+
"<ul><li>First list item with the keyphrase.</li><li>Second list item without it.</li></ul>",
1496+
{ locale: "en_US", keyword: "keyphrase" }
1497+
);
1498+
const researcher = new Researcher( paper );
1499+
buildTree( paper, researcher );
1500+
researcher.addResearchData( "morphology", morphologyData );
1501+
1502+
keyphraseDistributionResearcher( paper, researcher );
1503+
1504+
const sentences = getSentencesFromTree( paper.getTree() );
1505+
expect( sentences.length ).toBeGreaterThan( 0 );
1506+
sentences.forEach( sentence => {
1507+
expect( sentence.sentenceParentNode ).toBeUndefined();
1508+
} );
1509+
} );
1510+
1511+
it( "leaves the tree JSON-stringifiable (no cycle) after running", () => {
1512+
// JSON.stringify throws "Converting circular structure to JSON" on cycles, which is exactly the failure
1513+
// mode the bug produced downstream. A successful stringify is the strongest end-to-end guarantee that
1514+
// the cleanup actually broke the cycle, independent of which specific property held the back-reference.
1515+
const paper = new Paper(
1516+
"<p>The keyphrase appears here. The keyphrase appears again.</p>" +
1517+
"<ul><li>First list item with the keyphrase.</li><li>Second list item without it.</li></ul>",
1518+
{ locale: "en_US", keyword: "keyphrase" }
1519+
);
1520+
const researcher = new Researcher( paper );
1521+
buildTree( paper, researcher );
1522+
researcher.addResearchData( "morphology", morphologyData );
1523+
1524+
keyphraseDistributionResearcher( paper, researcher );
1525+
1526+
expect( () => JSON.stringify( paper.getTree() ) ).not.toThrow();
1527+
} );
1528+
} );
14631529
} );
14641530

14651531
describe( "a test for exact match of keyphrase in English", () => {

packages/yoastseo/spec/scoring/assessors/assessorSpec.js

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,38 @@ describe( "an assessor object", function() {
134134
} );
135135
} );
136136

137+
describe( "tree building", function() {
138+
it( "builds the HTML tree exactly once when the paper does not already have one", function() {
139+
const paper = new Paper( "<p>Hello world.</p>" );
140+
const assessor = new Assessor( new DefaultResearcher() );
141+
142+
expect( paper.getTree() ).toBeNull();
143+
const setTreeSpy = jest.spyOn( paper, "setTree" );
144+
145+
assessor.assess( paper );
146+
147+
expect( setTreeSpy ).toHaveBeenCalledTimes( 1 );
148+
expect( paper.getTree() ).not.toBeNull();
149+
} );
150+
151+
it( "does not rebuild the HTML tree when the paper already has one", function() {
152+
const paper = new Paper( "<p>Hello world.</p>" );
153+
const assessor = new Assessor( new DefaultResearcher() );
154+
155+
// Populate the tree once via a first assess() call so it's a real, valid tree.
156+
assessor.assess( paper );
157+
const existingTree = paper.getTree();
158+
expect( existingTree ).not.toBeNull();
159+
160+
const setTreeSpy = jest.spyOn( paper, "setTree" );
161+
162+
assessor.assess( paper );
163+
164+
expect( setTreeSpy ).not.toHaveBeenCalled();
165+
expect( paper.getTree() ).toBe( existingTree );
166+
} );
167+
} );
168+
137169
describe( "hasMarker", function() {
138170
let assessor;
139171

packages/yoastseo/spec/values/PaperSpec.js

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,68 @@ describe( "Paper", function() {
197197
} );
198198
} );
199199

200+
describe( "hasSameTreeInputsAs", function() {
201+
it( "returns true for two papers with identical text and tree-relevant attributes", function() {
202+
const paper1 = new Paper( "<p>Hello world.</p>" );
203+
const paper2 = new Paper( "<p>Hello world.</p>" );
204+
expect( paper1.hasSameTreeInputsAs( paper2 ) ).toBe( true );
205+
} );
206+
207+
it( "returns true when only non-tree attributes (keyword, title, description, …) differ", function() {
208+
const paper1 = new Paper( "<p>Hello world.</p>", { keyword: "alpha", title: "Title A", description: "Desc A" } );
209+
const paper2 = new Paper( "<p>Hello world.</p>", { keyword: "beta", title: "Title B", description: "Desc B" } );
210+
expect( paper1.hasSameTreeInputsAs( paper2 ) ).toBe( true );
211+
} );
212+
213+
it( "returns false when the text differs", function() {
214+
const paper1 = new Paper( "<p>Hello world.</p>" );
215+
const paper2 = new Paper( "<p>Goodbye world.</p>" );
216+
expect( paper1.hasSameTreeInputsAs( paper2 ) ).toBe( false );
217+
} );
218+
219+
it( "returns false when the locale differs", function() {
220+
const paper1 = new Paper( "<p>Hello world.</p>", { locale: "en_US" } );
221+
const paper2 = new Paper( "<p>Hello world.</p>", { locale: "nl_NL" } );
222+
expect( paper1.hasSameTreeInputsAs( paper2 ) ).toBe( false );
223+
} );
224+
225+
it( "returns false when the shortcodes list differs", function() {
226+
const paper1 = new Paper( "<p>Hello world.</p>", { shortcodes: [ "gallery" ] } );
227+
const paper2 = new Paper( "<p>Hello world.</p>", { shortcodes: [ "gallery", "caption" ] } );
228+
expect( paper1.hasSameTreeInputsAs( paper2 ) ).toBe( false );
229+
} );
230+
231+
it( "returns false when the wpBlocks list differs", function() {
232+
const paper1 = new Paper( "<p>Hello world.</p>", { wpBlocks: [ { name: "core/paragraph" } ] } );
233+
const paper2 = new Paper( "<p>Hello world.</p>", { wpBlocks: [ { name: "core/heading" } ] } );
234+
expect( paper1.hasSameTreeInputsAs( paper2 ) ).toBe( false );
235+
} );
236+
237+
it( "returns false when compared against a falsy value", function() {
238+
const paper = new Paper( "<p>Hello world.</p>" );
239+
expect( paper.hasSameTreeInputsAs( null ) ).toBe( false );
240+
} );
241+
242+
it( "treats a missing shortcodes attribute as equivalent to an empty array", function() {
243+
// A paper deserialized from a payload that omits `shortcodes` ends up with the attribute undefined,
244+
// but it produces the same tree as a paper that explicitly carries shortcodes: [], so the predicate
245+
// must not report them as different tree inputs.
246+
const paper1 = new Paper( "<p>Hello world.</p>", { shortcodes: [] } );
247+
const paper2 = new Paper( "<p>Hello world.</p>" );
248+
delete paper2._attributes.shortcodes;
249+
expect( paper1.hasSameTreeInputsAs( paper2 ) ).toBe( true );
250+
expect( paper2.hasSameTreeInputsAs( paper1 ) ).toBe( true );
251+
} );
252+
253+
it( "treats a missing wpBlocks attribute as equivalent to an empty array", function() {
254+
const paper1 = new Paper( "<p>Hello world.</p>", { wpBlocks: [] } );
255+
const paper2 = new Paper( "<p>Hello world.</p>" );
256+
delete paper2._attributes.wpBlocks;
257+
expect( paper1.hasSameTreeInputsAs( paper2 ) ).toBe( true );
258+
expect( paper2.hasSameTreeInputsAs( paper1 ) ).toBe( true );
259+
} );
260+
} );
261+
200262
describe( "hasText", function() {
201263
it( "should return true if contains raw text", function() {
202264
const paper = new Paper( "This is a test" );

0 commit comments

Comments
 (0)