Skip to content

Commit 00be40f

Browse files
author
Dong Nguyen
committed
Add JSDocs
1 parent 45955c3 commit 00be40f

12 files changed

Lines changed: 391 additions & 74 deletions

index.d.ts

Lines changed: 113 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,90 +1,140 @@
11
// Type definitions
22

3-
import { IOptions as SanitizeOptions } from "sanitize-html";
3+
import { IOptions as SanitizeOptions } from 'sanitize-html'
44

5+
/**
6+
* Transformation for per-site HTML pre/post processing.
7+
*/
58
export interface Transformation {
6-
patterns: Array<RegExp>,
9+
/** URL regex patterns to match */
10+
patterns: Array<RegExp>
11+
/** Function to pre-process raw HTML before extraction */
712
pre?: (document: Document) => Document
13+
/** Function to post-process extracted article content */
814
post?: (document: Document) => Document
915
}
1016

11-
export function addTransformations(transformations: Array<Transformation>): Number;
12-
export function removeTransformations(options: Array<RegExp>): Number;
13-
14-
export function getSanitizeHtmlOptions(): SanitizeOptions;
15-
export function setSanitizeHtmlOptions(options: SanitizeOptions): void;
16-
1717
/**
18-
* @param input url or html
18+
* Options for the article extraction process.
1919
*/
20-
2120
export interface ParserOptions {
22-
/**
23-
* to estimate time to read.
24-
* Default: 300
25-
*/
21+
/** Words per minute for time-to-read estimation. Default: 300 */
2622
wordsPerMinute?: number
27-
/**
28-
* max num of chars generated for description
29-
* Default: 210
30-
*/
23+
/** Max chars for generated description. Default: 210 */
3124
descriptionTruncateLen?: number
32-
/**
33-
* min num of chars required for description
34-
* Default: 180
35-
*/
25+
/** Min chars required for description. Default: 180 */
3626
descriptionLengthThreshold?: number
37-
/**
38-
* min num of chars required for content
39-
* Default: 200
40-
*/
27+
/** Min chars required for content. Default: 200 */
4128
contentLengthThreshold?: number
4229
}
4330

31+
/**
32+
* Proxy configuration for fetching articles.
33+
*/
4434
export interface ProxyConfig {
45-
target?: string;
46-
headers?: Record<string, string>;
35+
/** Proxy endpoint URL */
36+
target?: string
37+
/** Headers for proxy request */
38+
headers?: Record<string, string>
4739
}
4840

41+
/**
42+
* Options for the HTTP fetch request.
43+
*/
4944
export interface FetchOptions {
50-
/**
51-
* list of request headers
52-
* default: null
53-
*/
54-
headers?: Record<string, string>;
55-
/**
56-
* the values to configure proxy
57-
* default: null
58-
*/
59-
proxy?: ProxyConfig;
60-
61-
/**
62-
* http proxy agent
63-
* default: null
64-
*/
65-
agent?: object;
66-
/**
67-
* signal to terminate request
68-
* default: null
69-
*/
70-
signal?: object;
45+
/** Custom request headers */
46+
headers?: Record<string, string>
47+
/** Proxy configuration */
48+
proxy?: ProxyConfig
49+
/** HTTP proxy agent (e.g. HttpsProxyAgent) */
50+
agent?: object
51+
/** AbortSignal to cancel the request */
52+
signal?: object
7153
}
7254

55+
/**
56+
* Extracted article data structure.
57+
*/
7358
export interface ArticleData {
74-
url?: string;
75-
links?: string[];
76-
title?: string;
77-
description?: string;
78-
image?: string;
79-
favicon?: string;
80-
author?: string;
81-
content?: string;
82-
source?: string;
83-
published?: string;
84-
ttr?: number;
85-
type?: string;
59+
/** Best resolved URL of the article */
60+
url?: string
61+
/** Alternative URLs (canonical, shortlink, etc.) */
62+
links?: string[]
63+
/** Article title */
64+
title?: string
65+
/** Short description or excerpt */
66+
description?: string
67+
/** Main image URL */
68+
image?: string
69+
/** Site favicon URL */
70+
favicon?: string
71+
/** Author name */
72+
author?: string
73+
/** Extracted article HTML content */
74+
content?: string
75+
/** Original publisher/source domain */
76+
source?: string
77+
/** Publication date string */
78+
published?: string
79+
/** Estimated time to read in seconds (0 = unknown) */
80+
ttr?: number
81+
/** Page type (e.g. article) */
82+
type?: string
8683
}
8784

88-
export function extract(input: string, parserOptions?: ParserOptions, fetchOptions?: FetchOptions): Promise<ArticleData|null>;
85+
/**
86+
* Register one or more transformations for per-site HTML processing.
87+
*
88+
* @param transformations - Single transformation or array of transformations
89+
* @returns Number of transformations successfully added
90+
*/
91+
export function addTransformations (transformations: Transformation | Array<Transformation>): number
92+
93+
/**
94+
* Remove transformations matching the given patterns.
95+
* Calling without arguments removes all transformations.
96+
*
97+
* @param patterns - URL patterns to match for removal
98+
* @returns Number of transformations removed
99+
*/
100+
export function removeTransformations (patterns?: Array<RegExp>): number
101+
102+
/**
103+
* Get a copy of the current sanitize-html options.
104+
*/
105+
export function getSanitizeHtmlOptions (): SanitizeOptions
106+
107+
/**
108+
* Update sanitize-html options by merging with the current ones.
109+
*
110+
* @param options - Partial sanitize options to merge
111+
*/
112+
export function setSanitizeHtmlOptions (options: SanitizeOptions): void
89113

90-
export function extractFromHtml(html: string, url?: string, parserOptions?: ParserOptions): Promise<ArticleData|null>;
114+
/**
115+
* Load and extract article data from a URL or HTML string.
116+
*
117+
* @param input - URL or HTML string to extract from
118+
* @param parserOptions - Options for parsing
119+
* @param fetchOptions - Options for HTTP fetch
120+
* @returns Extracted article data or null
121+
*/
122+
export function extract (
123+
input: string,
124+
parserOptions?: ParserOptions,
125+
fetchOptions?: FetchOptions,
126+
): Promise<ArticleData | null>
127+
128+
/**
129+
* Extract article data from an HTML string directly.
130+
*
131+
* @param html - Raw HTML content
132+
* @param url - Source URL for resolving relative links
133+
* @param parserOptions - Options for parsing
134+
* @returns Extracted article data or null
135+
*/
136+
export function extractFromHtml (
137+
html: string,
138+
url?: string,
139+
parserOptions?: ParserOptions,
140+
): Promise<ArticleData | null>

src/config.js

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22

33
import { clone } from '@pwshub/bellajs'
44

5+
/**
6+
* Default sanitize-html options for cleaning extracted article content.
7+
* Defines allowed HTML tags, attributes, and iframe domains.
8+
*
9+
* @type {SanitizeOptions}
10+
*/
511
const sanitizeHtmlOptions = {
612
allowedTags: [
713
'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
@@ -10,7 +16,7 @@ const sanitizeHtmlOptions = {
1016
'details', 'summary',
1117
'pre', 'code',
1218
'ul', 'ol', 'li', 'dd', 'dl',
13-
'table', 'th', 'tr', 'td', 'thead', 'tbody', 'tfood',
19+
'table', 'th', 'tr', 'td', 'thead', 'tbody', 'tfoot',
1420
'fieldset', 'legend',
1521
'figure', 'figcaption', 'img', 'picture',
1622
'video', 'audio', 'source',
@@ -53,12 +59,20 @@ const sanitizeHtmlOptions = {
5359
}
5460

5561
/**
56-
* @returns {SanitizeOptions}
62+
* Get a clone of the current sanitize-html options.
63+
*
64+
* @returns {SanitizeOptions} Cloned sanitize options
5765
*/
5866
export const getSanitizeHtmlOptions = () => {
5967
return clone(sanitizeHtmlOptions)
6068
}
6169

70+
/**
71+
* Update sanitize-html options by merging with the current ones.
72+
*
73+
* @param {SanitizeOptions} [opts={}] - Partial options to merge
74+
* @returns {void}
75+
*/
6276
export const setSanitizeHtmlOptions = (opts = {}) => {
6377
Object.keys(opts).forEach((key) => {
6478
sanitizeHtmlOptions[key] = clone(opts[key])

src/main.js

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,21 @@ import parseFromHtml from './utils/parseFromHtml.js'
99
import { getCharset } from './utils/html.js'
1010
import { isValid as isValidUrl } from './utils/linker.js'
1111

12+
/**
13+
* Load and extract article data from a URL or HTML string.
14+
*
15+
* @param {string} input - URL or HTML string to extract from
16+
* @param {ParserOptions} [parserOptions={}] - Options for parsing
17+
* @param {FetchOptions} [fetchOptions={}] - Options for HTTP fetch
18+
* @returns {Promise<ArticleData|null>} Extracted article data or null
19+
*/
1220
export const extract = async (input, parserOptions = {}, fetchOptions = {}) => {
1321
if (!isString(input)) {
1422
throw new Error('Input must be a string')
1523
}
1624

1725
if (!isValidUrl(input)) {
18-
return parseFromHtml(input, null, parserOptions || {})
26+
return parseFromHtml(input, null, parserOptions)
1927
}
2028
const buffer = await retrieve(input, fetchOptions)
2129
const text = buffer ? Buffer.from(buffer).toString().trim() : ''
@@ -25,9 +33,17 @@ export const extract = async (input, parserOptions = {}, fetchOptions = {}) => {
2533
const charset = getCharset(text)
2634
const decoder = new TextDecoder(charset)
2735
const html = decoder.decode(buffer)
28-
return parseFromHtml(html, input, parserOptions || {})
36+
return parseFromHtml(html, input, parserOptions)
2937
}
3038

39+
/**
40+
* Extract article data from an HTML string directly.
41+
*
42+
* @param {string} html - Raw HTML content
43+
* @param {string} [url] - Source URL for resolving relative links
44+
* @param {ParserOptions} [parserOptions={}] - Options for parsing
45+
* @returns {Promise<ArticleData|null>} Extracted article data or null
46+
*/
3147
export const extractFromHtml = async (html, url, parserOptions = {}) => {
3248
return parseFromHtml(html, url, parserOptions)
3349
}

src/utils/extractLdSchema.js

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22

33
import { isArray, isObject, isString } from '@pwshub/bellajs'
44

5+
/**
6+
* Allowed JSON-LD schema types that indicate an article or webpage.
7+
*
8+
* @type {string[]}
9+
*/
510
const typeSchemas = [
611
'aboutpage',
712
'checkoutpage',
@@ -31,6 +36,11 @@ const typeSchemas = [
3136
'medicalscholarlyarticle',
3237
]
3338

39+
/**
40+
* Mapping from entry keys to JSON-LD attribute names.
41+
*
42+
* @type {Object<string, string>}
43+
*/
3444
const attributeLists = {
3545
description: 'description',
3646
image: 'image',
@@ -39,6 +49,12 @@ const attributeLists = {
3949
type: '@type',
4050
}
4151

52+
/**
53+
* Safely parse a JSON string, returning an empty object on failure.
54+
*
55+
* @param {string} text - JSON string to parse
56+
* @returns {Object} Parsed object or empty object
57+
*/
4258
const parseJson = (text) => {
4359
try {
4460
return JSON.parse(text)
@@ -47,6 +63,12 @@ const parseJson = (text) => {
4763
}
4864
}
4965

66+
/**
67+
* Check if the given JSON-LD object has an allowed schema type.
68+
*
69+
* @param {Object} ldJson - Parsed JSON-LD object
70+
* @returns {boolean} True if type is in the allowed list
71+
*/
5072
const isAllowedLdJsonType = (ldJson) => {
5173
const rootLdJsonType = ldJson['@type'] || ''
5274
const arr = isArray(rootLdJsonType) ? rootLdJsonType : [rootLdJsonType]
@@ -67,17 +89,17 @@ export default (document, entry) => {
6789
ldSchemas.forEach(ldSchema => {
6890
const ldJson = parseJson(ldSchema.textContent.replace(/[\n\r\t]/g, ''))
6991
if (ldJson && isAllowedLdJsonType(ldJson)) {
70-
Object.entries(attributeLists).forEach(([key, attr]) => {
92+
for (const [key, attr] of Object.entries(attributeLists)) {
7193
if (!entry[key] || !ldJson[attr]) {
72-
return
94+
continue
7395
}
7496

7597
const keyValue = ldJson[attr]
7698
const val = isArray(keyValue) ? keyValue[0] : isObject(keyValue) ? keyValue?.name || '' : keyValue
7799
if (isString(val) && val !== '') {
78100
entry[key] = val.trim()
79101
}
80-
})
102+
}
81103
}
82104
})
83105

src/utils/extractWithReadability.js

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,13 @@ import { Readability } from '@mozilla/readability'
44
import { DOMParser } from 'linkedom'
55
import { isString } from '@pwshub/bellajs'
66

7+
/**
8+
* Extract main article content from HTML using Mozilla Readability.
9+
*
10+
* @param {string} html - Raw HTML content
11+
* @param {string} [url=''] - Source URL for resolving relative paths
12+
* @returns {string|null} Extracted article HTML or null
13+
*/
714
export default (html, url = '') => {
815
if (!isString(html)) {
916
return null
@@ -19,6 +26,12 @@ export default (html, url = '') => {
1926
return result.textContent ? result.content : null
2027
}
2128

29+
/**
30+
* Extract article title from HTML using Mozilla Readability.
31+
*
32+
* @param {string} html - Raw HTML content
33+
* @returns {string|null} Extracted title or null
34+
*/
2235
export function extractTitleWithReadability (html) {
2336
if (!isString(html)) {
2437
return null

src/utils/findDate.js

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ function convertDateFormat (dateString) {
1111

1212
let year, month, day
1313

14-
if (parseInt(parts[0]) > 12) {
14+
if (parts[0].length === 4 || parseInt(parts[0]) > 31) {
15+
[year, month, day] = parts
16+
} else if (parseInt(parts[0]) > 12) {
1517
[day, month, year] = parts
1618
} else {
1719
[month, day, year] = parts

0 commit comments

Comments
 (0)