|
1 | 1 | // Type definitions |
2 | 2 |
|
3 | | -import { IOptions as SanitizeOptions } from "sanitize-html"; |
| 3 | +import { IOptions as SanitizeOptions } from 'sanitize-html' |
4 | 4 |
|
| 5 | +/** |
| 6 | + * Transformation for per-site HTML pre/post processing. |
| 7 | + */ |
5 | 8 | export interface Transformation { |
6 | | - patterns: Array<RegExp>, |
| 9 | + /** URL regex patterns to match */ |
| 10 | + patterns: Array<RegExp> |
| 11 | + /** Function to pre-process the page Document before extraction */ |
7 | 12 | pre?: (document: Document) => Document |
| 13 | + /** Function to post-process the extracted article content (passed as a Document) */ |
8 | 14 | post?: (document: Document) => Document |
9 | 15 | } |
10 | 16 |
|
11 | | -export function addTransformations(transformations: Array<Transformation>): Number; |
12 | | -export function removeTransformations(options: Array<RegExp>): Number; |
13 | | - |
14 | | -export function getSanitizeHtmlOptions(): SanitizeOptions; |
15 | | -export function setSanitizeHtmlOptions(options: SanitizeOptions): void; |
16 | | - |
17 | 17 | /** |
18 | | - * @param input url or html |
| 18 | + * Options for the article extraction process. |
19 | 19 | */ |
20 | | - |
21 | 20 | export interface ParserOptions { |
22 | | - /** |
23 | | - * to estimate time to read. |
24 | | - * Default: 300 |
25 | | - */ |
| 21 | + /** Words per minute for time-to-read estimation. Default: 300 */ |
26 | 22 | wordsPerMinute?: number |
27 | | - /** |
28 | | - * max num of chars generated for description |
29 | | - * Default: 210 |
30 | | - */ |
| 23 | + /** Max chars for generated description. Default: 210 */ |
31 | 24 | descriptionTruncateLen?: number |
32 | | - /** |
33 | | - * min num of chars required for description |
34 | | - * Default: 180 |
35 | | - */ |
| 25 | + /** Min chars required for description. Default: 180 */ |
36 | 26 | descriptionLengthThreshold?: number |
37 | | - /** |
38 | | - * min num of chars required for content |
39 | | - * Default: 200 |
40 | | - */ |
| 27 | + /** Min chars required for content. Default: 200 */ |
41 | 28 | contentLengthThreshold?: number |
42 | 29 | } |
43 | 30 |
|
| 31 | +/** |
| 32 | + * Proxy configuration for fetching articles. |
| 33 | + */ |
44 | 34 | export interface ProxyConfig { |
45 | | - target?: string; |
46 | | - headers?: Record<string, string>; |
| 35 | + /** Proxy endpoint URL */ |
| 36 | + target?: string |
| 37 | + /** Headers for proxy request */ |
| 38 | + headers?: Record<string, string> |
47 | 39 | } |
48 | 40 |
|
| 41 | +/** |
| 42 | + * Options for the HTTP fetch request. |
| 43 | + */ |
49 | 44 | export interface FetchOptions { |
50 | | - /** |
51 | | - * list of request headers |
52 | | - * default: null |
53 | | - */ |
54 | | - headers?: Record<string, string>; |
55 | | - /** |
56 | | - * the values to configure proxy |
57 | | - * default: null |
58 | | - */ |
59 | | - proxy?: ProxyConfig; |
60 | | - |
61 | | - /** |
62 | | - * http proxy agent |
63 | | - * default: null |
64 | | - */ |
65 | | - agent?: object; |
66 | | - /** |
67 | | - * signal to terminate request |
68 | | - * default: null |
69 | | - */ |
70 | | - signal?: object; |
| 45 | + /** Custom request headers */ |
| 46 | + headers?: Record<string, string> |
| 47 | + /** Proxy configuration */ |
| 48 | + proxy?: ProxyConfig |
| 49 | + /** HTTP proxy agent (e.g. HttpsProxyAgent) */ |
| 50 | + agent?: object |
| 51 | + /** AbortSignal to cancel the request */ |
| 52 | + signal?: object |
71 | 53 | } |
72 | 54 |
|
| 55 | +/** |
| 56 | + * Extracted article data structure. |
| 57 | + */ |
73 | 58 | export interface ArticleData { |
74 | | - url?: string; |
75 | | - links?: string[]; |
76 | | - title?: string; |
77 | | - description?: string; |
78 | | - image?: string; |
79 | | - favicon?: string; |
80 | | - author?: string; |
81 | | - content?: string; |
82 | | - source?: string; |
83 | | - published?: string; |
84 | | - ttr?: number; |
85 | | - type?: string; |
| 59 | + /** Best resolved URL of the article */ |
| 60 | + url?: string |
| 61 | + /** Alternative URLs (canonical, shortlink, etc.) */ |
| 62 | + links?: string[] |
| 63 | + /** Article title */ |
| 64 | + title?: string |
| 65 | + /** Short description or excerpt */ |
| 66 | + description?: string |
| 67 | + /** Main image URL */ |
| 68 | + image?: string |
| 69 | + /** Site favicon URL */ |
| 70 | + favicon?: string |
| 71 | + /** Author name */ |
| 72 | + author?: string |
| 73 | + /** Extracted article HTML content */ |
| 74 | + content?: string |
| 75 | + /** Original publisher/source domain */ |
| 76 | + source?: string |
| 77 | + /** Publication date string */ |
| 78 | + published?: string |
| 79 | + /** Estimated time to read in seconds (0 = unknown) */ |
| 80 | + ttr?: number |
| 81 | + /** Page type (e.g. article) */ |
| 82 | + type?: string |
86 | 83 | } |
87 | 84 |
|
88 | | -export function extract(input: string, parserOptions?: ParserOptions, fetchOptions?: FetchOptions): Promise<ArticleData|null>; |
| 85 | +/** |
| 86 | + * Register one or more transformations for per-site HTML processing. |
| 87 | + * |
| 88 | + * @param transformations - Single transformation or array of transformations |
| 89 | + * @returns Number of transformations successfully added |
| 90 | + */ |
| 91 | +export function addTransformations (transformations: Transformation | Array<Transformation>): number |
| 92 | + |
| 93 | +/** |
| 94 | + * Remove transformations matching the given patterns. |
| 95 | + * Calling without arguments removes all transformations. |
| 96 | + * |
| 97 | + * @param patterns - URL patterns to match for removal |
| 98 | + * @returns Number of transformations removed |
| 99 | + */ |
| 100 | +export function removeTransformations (patterns?: Array<RegExp>): number |
| 101 | + |
| 102 | +/** |
| 103 | + * Get a copy of the current sanitize-html options. |
| 104 | + */ |
| 105 | +export function getSanitizeHtmlOptions (): SanitizeOptions |
| 106 | + |
| 107 | +/** |
| 108 | + * Update sanitize-html options by merging with the current ones. |
| 109 | + * |
| 110 | + * @param options - Partial sanitize options to merge |
| 111 | + */ |
| 112 | +export function setSanitizeHtmlOptions (options: SanitizeOptions): void |
89 | 113 |
|
90 | | -export function extractFromHtml(html: string, url?: string, parserOptions?: ParserOptions): Promise<ArticleData|null>; |
| 114 | +/** |
| 115 | + * Load and extract article data from a URL or HTML string. |
| 116 | + * |
| 117 | + * @param input - URL or HTML string to extract from |
| 118 | + * @param parserOptions - Options for parsing |
| 119 | + * @param fetchOptions - Options for HTTP fetch |
| 120 | + * @returns Extracted article data or null |
| 121 | + */ |
| 122 | +export function extract ( |
| 123 | + input: string, |
| 124 | + parserOptions?: ParserOptions, |
| 125 | + fetchOptions?: FetchOptions, |
| 126 | +): Promise<ArticleData | null> |
| 127 | + |
| 128 | +/** |
| 129 | + * Extract article data from an HTML string directly. |
| 130 | + * |
| 131 | + * @param html - Raw HTML content |
| 132 | + * @param url - Source URL for resolving relative links |
| 133 | + * @param parserOptions - Options for parsing |
| 134 | + * @returns Extracted article data or null |
| 135 | + */ |
| 136 | +export function extractFromHtml ( |
| 137 | + html: string, |
| 138 | + url?: string, |
| 139 | + parserOptions?: ParserOptions, |
| 140 | +): Promise<ArticleData | null> |
0 commit comments