extractus
diff --git a/index.d.ts b/index.d.ts
Lines changed: 113 additions & 63 deletions
diff --git a/src/config.js b/src/config.js
Lines changed: 16 additions & 2 deletions
diff --git a/src/main.js b/src/main.js
Lines changed: 18 additions & 2 deletions
diff --git a/src/utils/extractLdSchema.js b/src/utils/extractLdSchema.js
Lines changed: 25 additions & 3 deletions
diff --git a/src/utils/extractWithReadability.js b/src/utils/extractWithReadability.js
Lines changed: 13 additions & 0 deletions
diff --git a/src/utils/findDate.js b/src/utils/findDate.js
Lines changed: 3 additions & 1 deletion
@@ -1,90 +1,140 @@
 // Type definitions
 
-import { IOptions as SanitizeOptions } from "sanitize-html";
+import { IOptions as SanitizeOptions } from 'sanitize-html'
 
+/**
+ * Transformation for per-site HTML pre/post processing.
+ */
 export interface Transformation {
-  patterns: Array<RegExp>,
+  /** URL regex patterns to match */
+  patterns: Array<RegExp>
+  /** Function to pre-process the parsed Document before extraction */
   pre?: (document: Document) => Document
+  /** Function to post-process the Document after extraction */
   post?: (document: Document) => Document
 }
 
-export function addTransformations(transformations: Array<Transformation>): Number;
-export function removeTransformations(options: Array<RegExp>): Number;
-
-export function getSanitizeHtmlOptions(): SanitizeOptions;
-export function setSanitizeHtmlOptions(options: SanitizeOptions): void;
-
 /**
- * @param input url or html
+ * Options for the article extraction process.
  */
-
 export interface ParserOptions {
-  /**
-   * to estimate time to read.
-   * Default: 300
-   */
+  /** Words per minute for time-to-read estimation. Default: 300 */
   wordsPerMinute?: number
-  /**
-   * max num of chars generated for description
-   * Default: 210
-   */
+  /** Max chars for generated description. Default: 210 */
   descriptionTruncateLen?: number
-  /**
-   * min num of chars required for description
-   * Default: 180
-   */
+  /** Min chars required for description. Default: 180 */
   descriptionLengthThreshold?: number
-  /**
-   * min num of chars required for content
-   * Default: 200
-   */
+  /** Min chars required for content. Default: 200 */
   contentLengthThreshold?: number
 }
 
+/**
+ * Proxy configuration for fetching articles.
+ */
 export interface ProxyConfig {
-  target?: string;
-  headers?: Record<string, string>;
+  /** Proxy endpoint URL */
+  target?: string
+  /** Headers for proxy request */
+  headers?: Record<string, string>
 }
 
+/**
+ * Options for the HTTP fetch request.
+ */
 export interface FetchOptions {
-  /**
-   * list of request headers
-   * default: null
-   */
-  headers?: Record<string, string>;
-  /**
-   * the values to configure proxy
-   * default: null
-   */
-  proxy?: ProxyConfig;
-
-  /**
-   * http proxy agent
-   * default: null
-   */
-  agent?: object;
-  /**
-   * signal to terminate request
-   * default: null
-   */
-  signal?: object;
+  /** Custom request headers */
+  headers?: Record<string, string>
+  /** Proxy configuration */
+  proxy?: ProxyConfig
+  /** HTTP proxy agent (e.g. HttpsProxyAgent) */
+  agent?: object
+  /** AbortSignal to cancel the request */
+  signal?: object
 }
 
+/**
+ * Extracted article data structure.
+ */
 export interface ArticleData {
-  url?: string;
-  links?: string[];
-  title?: string;
-  description?: string;
-  image?: string;
-  favicon?: string;
-  author?: string;
-  content?: string;
-  source?: string;
-  published?: string;
-  ttr?: number;
-  type?: string;
+  /** Best resolved URL of the article */
+  url?: string
+  /** Alternative URLs (canonical, shortlink, etc.) */
+  links?: string[]
+  /** Article title */
+  title?: string
+  /** Short description or excerpt */
+  description?: string
+  /** Main image URL */
+  image?: string
+  /** Site favicon URL */
+  favicon?: string
+  /** Author name */
+  author?: string
+  /** Extracted article HTML content */
+  content?: string
+  /** Original publisher/source domain */
+  source?: string
+  /** Publication date string */
+  published?: string
+  /** Estimated time to read in seconds (0 = unknown) */
+  ttr?: number
+  /** Page type (e.g. article) */
+  type?: string
 }
 
-export function extract(input: string, parserOptions?: ParserOptions, fetchOptions?: FetchOptions): Promise<ArticleData|null>;
+/**
+ * Register one or more transformations for per-site HTML processing.
+ *
+ * @param transformations - Single transformation or array of transformations
+ * @returns Number of transformations successfully added
+ */
+export function addTransformations (transformations: Transformation | Array<Transformation>): number
+
+/**
+ * Remove transformations matching the given patterns.
+ * Calling without arguments removes all transformations.
+ *
+ * @param patterns - URL patterns to match for removal
+ * @returns Number of transformations removed
+ */
+export function removeTransformations (patterns?: Array<RegExp>): number
+
+/**
+ * Get a copy of the current sanitize-html options.
+ */
+export function getSanitizeHtmlOptions (): SanitizeOptions
+
+/**
+ * Update sanitize-html options; each provided top-level key replaces the current value.
+ *
+ * @param options - Partial sanitize options to merge
+ */
+export function setSanitizeHtmlOptions (options: SanitizeOptions): void
 
-export function extractFromHtml(html: string, url?: string, parserOptions?: ParserOptions): Promise<ArticleData|null>;
+/**
+ * Load and extract article data from a URL or HTML string.
+ *
+ * @param input - URL or HTML string to extract from
+ * @param parserOptions - Options for parsing
+ * @param fetchOptions - Options for HTTP fetch
+ * @returns Extracted article data or null
+ */
+export function extract (
+  input: string,
+  parserOptions?: ParserOptions,
+  fetchOptions?: FetchOptions,
+): Promise<ArticleData | null>
+
+/**
+ * Extract article data from an HTML string directly.
+ *
+ * @param html - Raw HTML content
+ * @param url - Source URL for resolving relative links
+ * @param parserOptions - Options for parsing
+ * @returns Extracted article data or null
+ */
+export function extractFromHtml (
+  html: string,
+  url?: string,
+  parserOptions?: ParserOptions,
+): Promise<ArticleData | null>
@@ -2,6 +2,12 @@
 
 import { clone } from '@pwshub/bellajs'
 
+/**
+ * Default sanitize-html options for cleaning extracted article content.
+ * Defines allowed HTML tags, attributes, and iframe domains.
+ *
+ * @type {SanitizeOptions}
+ */
 const sanitizeHtmlOptions = {
   allowedTags: [
     'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
@@ -10,7 +16,7 @@ const sanitizeHtmlOptions = {
     'details', 'summary',
     'pre', 'code',
     'ul', 'ol', 'li', 'dd', 'dl',
-    'table', 'th', 'tr', 'td', 'thead', 'tbody', 'tfood',
+    'table', 'th', 'tr', 'td', 'thead', 'tbody', 'tfoot',
     'fieldset', 'legend',
     'figure', 'figcaption', 'img', 'picture',
     'video', 'audio', 'source',
@@ -53,12 +59,20 @@ const sanitizeHtmlOptions = {
 }
 
 /**
- * @returns {SanitizeOptions}
+ * Get a clone of the current sanitize-html options.
+ *
+ * @returns {SanitizeOptions} Cloned sanitize options
  */
 export const getSanitizeHtmlOptions = () => {
   return clone(sanitizeHtmlOptions)
 }
 
+/**
+ * Update sanitize-html options; each provided top-level key replaces the current value.
+ *
+ * @param {SanitizeOptions} [opts={}] - Partial options to merge
+ * @returns {void}
+ */
 export const setSanitizeHtmlOptions = (opts = {}) => {
   Object.keys(opts).forEach((key) => {
     sanitizeHtmlOptions[key] = clone(opts[key])
 
@@ -9,13 +9,21 @@ import parseFromHtml from './utils/parseFromHtml.js'
 import { getCharset } from './utils/html.js'
 import { isValid as isValidUrl } from './utils/linker.js'
 
+/**
+ * Load and extract article data from a URL or HTML string.
+ *
+ * @param {string} input - URL or HTML string to extract from
+ * @param {ParserOptions} [parserOptions={}] - Options for parsing
+ * @param {FetchOptions} [fetchOptions={}] - Options for HTTP fetch
+ * @returns {Promise<ArticleData|null>} Extracted article data or null
+ */
 export const extract = async (input, parserOptions = {}, fetchOptions = {}) => {
   if (!isString(input)) {
     throw new Error('Input must be a string')
   }
 
   if (!isValidUrl(input)) {
-    return parseFromHtml(input, null, parserOptions || {})
+    return parseFromHtml(input, null, parserOptions)
   }
   const buffer = await retrieve(input, fetchOptions)
   const text = buffer ? Buffer.from(buffer).toString().trim() : ''
@@ -25,9 +33,17 @@ export const extract = async (input, parserOptions = {}, fetchOptions = {}) => {
   const charset = getCharset(text)
   const decoder = new TextDecoder(charset)
   const html = decoder.decode(buffer)
-  return parseFromHtml(html, input, parserOptions || {})
+  return parseFromHtml(html, input, parserOptions)
 }
 
+/**
+ * Extract article data from an HTML string directly.
+ *
+ * @param {string} html - Raw HTML content
+ * @param {string} [url] - Source URL for resolving relative links
+ * @param {ParserOptions} [parserOptions={}] - Options for parsing
+ * @returns {Promise<ArticleData|null>} Extracted article data or null
+ */
 export const extractFromHtml = async (html, url, parserOptions = {}) => {
   return parseFromHtml(html, url, parserOptions)
 }
 
@@ -2,6 +2,11 @@
 
 import { isArray, isObject, isString } from '@pwshub/bellajs'
 
+/**
+ * Allowed JSON-LD schema types that indicate an article or webpage.
+ *
+ * @type {string[]}
+ */
 const typeSchemas = [
   'aboutpage',
   'checkoutpage',
@@ -31,6 +36,11 @@ const typeSchemas = [
   'medicalscholarlyarticle',
 ]
 
+/**
+ * Mapping from entry keys to JSON-LD attribute names.
+ *
+ * @type {Object<string, string>}
+ */
 const attributeLists = {
   description: 'description',
   image: 'image',
@@ -39,6 +49,12 @@ const attributeLists = {
   type: '@type',
 }
 
+/**
+ * Safely parse a JSON string, returning an empty object on failure.
+ *
+ * @param {string} text - JSON string to parse
+ * @returns {Object} Parsed object or empty object
+ */
 const parseJson = (text) => {
   try {
     return JSON.parse(text)
@@ -47,6 +63,12 @@ const parseJson = (text) => {
   }
 }
 
+/**
+ * Check if the given JSON-LD object has an allowed schema type.
+ *
+ * @param {Object} ldJson - Parsed JSON-LD object
+ * @returns {boolean} True if type is in the allowed list
+ */
 const isAllowedLdJsonType = (ldJson) => {
   const rootLdJsonType = ldJson['@type'] || ''
   const arr = isArray(rootLdJsonType) ? rootLdJsonType : [rootLdJsonType]
@@ -67,17 +89,17 @@ export default (document, entry) => {
   ldSchemas.forEach(ldSchema => {
     const ldJson = parseJson(ldSchema.textContent.replace(/[\n\r\t]/g, ''))
     if (ldJson && isAllowedLdJsonType(ldJson)) {
-      Object.entries(attributeLists).forEach(([key, attr]) => {
+      for (const [key, attr] of Object.entries(attributeLists)) {
         if (!entry[key] || !ldJson[attr]) {
-          return
+          continue
         }
 
         const keyValue = ldJson[attr]
         const val = isArray(keyValue) ? keyValue[0] : isObject(keyValue) ? keyValue?.name || '' : keyValue
         if (isString(val) && val !== '') {
           entry[key] = val.trim()
         }
-      })
+      }
     }
   })
 
 
@@ -4,6 +4,13 @@ import { Readability } from '@mozilla/readability'
 import { DOMParser } from 'linkedom'
 import { isString } from '@pwshub/bellajs'
 
+/**
+ * Extract main article content from HTML using Mozilla Readability.
+ *
+ * @param {string} html - Raw HTML content
+ * @param {string} [url=''] - Source URL for resolving relative paths
+ * @returns {string|null} Extracted article HTML or null
+ */
 export default (html, url = '') => {
   if (!isString(html)) {
     return null
@@ -19,6 +26,12 @@ export default (html, url = '') => {
   return result.textContent ? result.content : null
 }
 
+/**
+ * Extract article title from HTML using Mozilla Readability.
+ *
+ * @param {string} html - Raw HTML content
+ * @returns {string|null} Extracted title or null
+ */
 export function extractTitleWithReadability (html) {
   if (!isString(html)) {
     return null
 
@@ -11,7 +11,9 @@ function convertDateFormat (dateString) {
 
   let year, month, day
 
-  if (parseInt(parts[0]) > 12) {
+  if (parts[0].length === 4 || parseInt(parts[0]) > 31) {
+    [year, month, day] = parts
+  } else if (parseInt(parts[0]) > 12) {
     [day, month, year] = parts
   } else {
     [month, day, year] = parts