feat: add CSV and TSV column input support

wwb1942 · wwb1942 · commit f64858392e2f · 2026-03-13T19:40:57.000+08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,19 @@ All notable changes to this repository will be documented in this file.
 
 - Ongoing documentation and release polish.
 
+## [v0.5.0] - 2026-03-13
+
+### Added
+
+- `--column COLUMN_NAME` support for reading paper ids / URLs from named columns in `.csv` and `.tsv` input files
+
+### Changed
+
+- plain-text `--input-file` behavior remains line-based and backward compatible
+- structured CSV/TSV inputs now ignore blank rows, comment-only rows, and blank selected cells
+- CSV/TSV files without `--column` now auto-select an input column only when it is unambiguous; otherwise the CLI fails with the available column names
+- README, skill instructions, packaged artifact, and CI smoke coverage now document and verify structured file input handling
+
 ## [v0.4.1] - 2026-03-13
 
 ### Added
diff --git a/README.md b/README.md
@@ -122,13 +122,46 @@ EOF
 python3 scripts/alphaxiv_lookup.py --input-file papers.txt --format brief
 ```
 
+### Batch lookup from CSV with an explicit column
+
+```bash
+cat > papers.csv <<'EOF'
+paper_id,title
+2603.07612,Example Paper
+# comment row,
+,Missing id
+https://arxiv.org/abs/2401.12345,Another Paper
+EOF
+python3 scripts/alphaxiv_lookup.py --input-file papers.csv --column paper_id --format brief
+```
+
+### Batch lookup from TSV with an obvious default column
+
+```bash
+cat > papers.tsv <<'EOF'
+paper_id	title
+2603.07612	Example Paper
+2401.12345	Another Paper
+EOF
+python3 scripts/alphaxiv_lookup.py --input-file papers.tsv --format json-compact
+```
+
 ### Combine `--input-file` with direct arguments
 
 ```bash
 python3 scripts/alphaxiv_lookup.py --input-file papers.txt 'https://www.alphaxiv.org/overview/2501.01234' --format json-compact
 ```
 
-`--input-file PATH` reads one paper id or URL per line, ignores blank lines, ignores lines starting with `#`, and participates in the same single-item vs batch rendering rules as direct positional arguments.
+`--input-file PATH` keeps `.txt` and other non-structured files line-based: one paper id or URL per line, with blank lines and lines starting with `#` ignored.
+
+For `.csv` and `.tsv` files, the CLI reads a header row and then pulls values from a named column:
+
+- use `--column COLUMN_NAME` to select the input column explicitly
+- blank rows, comment-only rows, and rows where the selected column is blank are ignored
+- if `--column` is omitted, the CLI only auto-selects a column when it is unambiguous (for example the file has exactly one column, or exactly one clearly named input column such as `paper_id` or `url`)
+- otherwise it fails clearly and prints the available column names
+
+Structured-file inputs participate in the same single-item vs batch rendering rules as direct positional arguments, and `--input-file` can still be combined with direct ids / URLs in the same command.
 
 ## Output fields
 
@@ -194,6 +227,7 @@ Structure:
 - `--format brief` / `--format brief-zh` prefer the best retrieved summary, but can still produce a useful user-facing brief from the arXiv abstract alone
 - Batch mode accepts multiple ids / URLs in one run and keeps single-item behavior backward compatible
 - `--input-file PATH` can be used more than once and can be combined with direct ids / URLs in the same command
+- `.csv` and `.tsv` inputs support header-based extraction through `--column COLUMN_NAME`, while plain text files keep the existing line-by-line behavior unchanged
 - AlphaXiv is treated as a shortcut, not a replacement for reading the full paper when exact details matter
 
 ## License
diff --git a/SKILL.md b/SKILL.md
@@ -17,8 +17,12 @@ Prefer alphaXiv first because it often exposes an AI-generated overview that is
    - Accept alphaXiv URLs like `https://www.alphaxiv.org/overview/2401.12345`
 2. Run the bundled script:
    - The script accepts one or more paper ids / URLs in a single invocation.
-   - Use `--input-file PATH` to read one id / URL per line; ignore blank lines and lines starting with `#`.
+   - Use `--input-file PATH` to read batch inputs from a file; the option can be repeated and combined with direct ids / URLs.
+   - Plain-text inputs stay line-based: read one id / URL per line, ignoring blank lines and lines starting with `#`.
+   - CSV/TSV inputs use a header row. Prefer `--column COLUMN_NAME` to select the input column explicitly.
+   - If `--column` is omitted for CSV/TSV, the script only auto-selects an obvious single input column; otherwise it fails and prints the available columns.
    - `python3 scripts/alphaxiv_lookup.py "<paper-or-url>" --format markdown`
+   - `python3 scripts/alphaxiv_lookup.py --input-file papers.csv --column paper_id --format json`
    - Use `--format json` for full structured output.
    - Use `--format json-compact` when you want a smaller machine-friendly payload.
    - Use `--format text` for a clean plain-text brief.
diff --git a/dist/alphaxiv-paper-lookup.skill b/dist/alphaxiv-paper-lookup.skill
diff --git a/scripts/alphaxiv_lookup.py b/scripts/alphaxiv_lookup.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 import argparse
+import csv
 import html
 import json
 import re
@@ -87,6 +88,22 @@
 )
 
 
+# Header names that obvious_input_column_index() accepts as a paper-id / URL
+# column when --column is omitted. Entries are stored in the canonical form
+# produced by canonicalize_column_name(): lowercased, with whitespace,
+# underscores and hyphens removed (so "Paper ID" and "paper_id" both match
+# "paperid").
+OBVIOUS_INPUT_COLUMN_NAMES = {
+    "paper",
+    "paperid",
+    "paperurl",
+    "arxiv",
+    "arxivid",
+    "arxivurl",
+    "url",
+    "link",
+}
+
+
+class InputFileError(ValueError):
+    """Raised when a structured (CSV/TSV) input file cannot be used as given.
+
+    The message is user-facing: it names the file and, where relevant, lists
+    the available column names.
+    """
+
+    # main() handles this exception in the same ``except`` clause as OSError
+    # and reads ``err.filename`` before checking the exception type. A plain
+    # ValueError has no such attribute, so provide a default here; otherwise
+    # every InputFileError would surface as an AttributeError traceback.
+    filename = None
+
+
 def fetch(url: str, timeout: int = 25) -> str:
     req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
     with urllib.request.urlopen(req, timeout=timeout) as resp:
@@ -758,7 +775,110 @@ def render_many(results: List[Dict[str, object]], output_format: str) -> str:
     return ("\n\n" + ("=" * 80) + "\n\n").join(blocks) + "\n"
 
 
-def read_input_file(path: str) -> List[str]:
+def canonicalize_column_name(name: str) -> str:
+    """Return *name* lowercased with whitespace, underscores and hyphens removed.
+
+    This lets headers such as ``Paper ID``, ``paper_id`` and ``paper-id``
+    compare equal.
+    """
+    lowered = name.strip().lower()
+    separators = re.compile(r"[\s_-]+")
+    return separators.sub("", lowered)
+
+
+def nonempty_row_values(row: List[str]) -> List[str]:
+    """Return the stripped cells of *row*, dropping empty and whitespace-only cells."""
+    kept: List[str] = []
+    for cell in row:
+        if not cell:
+            continue
+        trimmed = cell.strip()
+        if trimmed:
+            kept.append(trimmed)
+    return kept
+
+
+def is_blank_row(row: List[str]) -> bool:
+    """Return True when no cell of *row* contains non-whitespace text."""
+    for cell in row:
+        if cell and cell.strip():
+            return False
+    return True
+
+
+def is_comment_only_row(row: List[str]) -> bool:
+    """Return True when exactly one cell of *row* has content and it starts with ``#``."""
+    populated = [cell.strip() for cell in row if cell and cell.strip()]
+    if len(populated) != 1:
+        return False
+    (only_value,) = populated
+    return only_value.startswith("#")
+
+
+def visible_column_names(columns: List[str]) -> List[str]:
+    """Return the header names in *columns*, omitting empty ones and keeping order."""
+    return list(filter(None, columns))
+
+
+def obvious_input_column_index(columns: List[str]) -> Optional[int]:
+    """Return the index of the one column that can be auto-selected, or None.
+
+    A column is chosen when it is the only non-empty header, or when it is the
+    only header whose canonical name appears in OBVIOUS_INPUT_COLUMN_NAMES.
+    Any other situation is treated as ambiguous and yields None.
+    """
+    named_positions = [position for position, header in enumerate(columns) if header]
+    if len(named_positions) == 1:
+        return named_positions[0]
+
+    recognised = [
+        position
+        for position in named_positions
+        if canonicalize_column_name(columns[position]) in OBVIOUS_INPUT_COLUMN_NAMES
+    ]
+    return recognised[0] if len(recognised) == 1 else None
+
+
+def resolve_structured_input_column(path: str, columns: List[str], column_name: Optional[str]) -> int:
+    """Return the index in *columns* of the column that holds the paper inputs.
+
+    Args:
+        path: File path, used only in error messages.
+        columns: Header cells of the structured file (empty cells allowed).
+        column_name: Column requested via ``--column``; a falsy value means
+            the column should be auto-selected.
+
+    Returns:
+        The zero-based index of the selected column.
+
+    Raises:
+        InputFileError: If the header has no named column, the requested
+            column is missing or matches several headers, or no column can be
+            auto-selected unambiguously.
+    """
+    named_headers = [header for header in columns if header]
+    if not named_headers:
+        raise InputFileError(f"structured input file '{path}' has an empty header row")
+
+    available = ", ".join(visible_column_names(columns))
+
+    if not column_name:
+        # No explicit request: only proceed when one column is the obvious choice.
+        auto_index = obvious_input_column_index(columns)
+        if auto_index is None:
+            raise InputFileError(
+                f"structured input file '{path}' requires --column COLUMN_NAME; available columns: {available}"
+            )
+        return auto_index
+
+    # Compare canonical forms so "Paper ID", "paper_id" and "paper-id" are interchangeable.
+    requested = canonicalize_column_name(column_name)
+    candidates = [
+        position
+        for position, header in enumerate(columns)
+        if header and canonicalize_column_name(header) == requested
+    ]
+    if not candidates:
+        raise InputFileError(
+            f"structured input file '{path}' does not contain column '{column_name}'; available columns: {available}"
+        )
+    if len(candidates) > 1:
+        raise InputFileError(
+            f"structured input file '{path}' has multiple columns matching '{column_name}'; available columns: {available}"
+        )
+    return candidates[0]
+
+
+def read_structured_input_file(path: str, delimiter: str, column_name: Optional[str]) -> List[str]:
+    """Read paper ids / URLs from one column of a delimited file with a header row.
+
+    The first row that is neither blank nor comment-only is taken as the
+    header. Each later row contributes the stripped value of the selected
+    column. Blank rows, comment-only rows, rows too short to reach the
+    selected column, and values that are empty or start with ``#`` are skipped.
+
+    Args:
+        path: Path of the CSV/TSV file to read.
+        delimiter: Field delimiter passed to ``csv.reader``.
+        column_name: Column requested via ``--column``, or None to auto-select.
+
+    Returns:
+        The selected values in file order; an empty list when the file has no
+        header row.
+
+    Raises:
+        InputFileError: If no input column can be resolved, or the csv module
+            cannot parse the file.
+        OSError: If the file cannot be opened or read.
+    """
+    papers: List[str] = []
+    # utf-8-sig drops a leading BOM so it cannot end up inside the first header
+    # name; newline="" leaves line-ending handling to the csv module.
+    with open(path, "r", encoding="utf-8-sig", newline="") as handle:
+        reader = csv.reader(handle, delimiter=delimiter)
+        try:
+            columns: Optional[List[str]] = None
+            for row in reader:
+                if is_blank_row(row) or is_comment_only_row(row):
+                    continue
+                columns = [cell.strip() for cell in row]
+                break
+
+            if columns is None:
+                return papers
+
+            column_index = resolve_structured_input_column(path, columns, column_name)
+
+            # The reader resumes after the header row consumed above.
+            for row in reader:
+                if is_blank_row(row) or is_comment_only_row(row):
+                    continue
+                value = row[column_index].strip() if column_index < len(row) else ""
+                if not value or value.startswith("#"):
+                    continue
+                papers.append(value)
+        except csv.Error as err:
+            # csv.Error is neither InputFileError nor OSError, so the handler in
+            # main() would not catch it; convert it so the CLI reports a clean error.
+            raise InputFileError(
+                f"structured input file '{path}' could not be parsed as delimited text: {err}"
+            ) from err
+
+    return papers
+
+
+def read_input_file(path: str, column_name: Optional[str] = None) -> List[str]:
+    lowered_path = path.lower()
+    if lowered_path.endswith(".csv"):
+        return read_structured_input_file(path, ",", column_name)
+    if lowered_path.endswith(".tsv"):
+        return read_structured_input_file(path, "\t", column_name)
+
     papers: List[str] = []
     with open(path, "r", encoding="utf-8") as handle:
         for raw_line in handle:
@@ -769,7 +889,7 @@ def read_input_file(path: str) -> List[str]:
     return papers
 
 
-def expand_cli_inputs(argv: List[str]) -> List[str]:
+def expand_cli_inputs(argv: List[str], input_column: Optional[str] = None) -> List[str]:
     papers: List[str] = []
     index = 0
 
@@ -784,20 +904,20 @@ def expand_cli_inputs(argv: List[str]) -> List[str]:
             index += 1
             if index >= len(argv):
                 break
-            papers.extend(read_input_file(argv[index]))
+            papers.extend(read_input_file(argv[index], input_column))
             index += 1
             continue
 
         if token.startswith("--input-file="):
-            papers.extend(read_input_file(token.split("=", 1)[1]))
+            papers.extend(read_input_file(token.split("=", 1)[1], input_column))
             index += 1
             continue
 
-        if token in {"--format", "--timeout"}:
+        if token in {"--column", "--format", "--timeout"}:
             index += 2
             continue
 
-        if token.startswith("--format=") or token.startswith("--timeout="):
+        if token.startswith("--column=") or token.startswith("--format=") or token.startswith("--timeout="):
             index += 1
             continue
 
@@ -921,17 +1041,23 @@ def main(argv: Optional[List[str]] = None) -> int:
         action="append",
         default=[],
         metavar="PATH",
-        help="Read one paper id or URL per line from PATH. Blank lines and lines starting with # are ignored.",
+        help="Read paper ids or URLs from PATH. Text files stay line-based; CSV/TSV files support header-based column selection.",
+    )
+    parser.add_argument(
+        "--column",
+        help="For CSV/TSV --input-file values, read paper ids or URLs from COLUMN_NAME. If omitted, an obvious structured column is used only when it can be chosen unambiguously.",
     )
     parser.add_argument("--format", choices=["json", "json-compact", "markdown", "text", "brief", "brief-zh"], default="json")
     parser.add_argument("--timeout", type=int, default=25, help="HTTP timeout in seconds (default: 25)")
     args = parser.parse_args(argv)
 
     try:
-        papers = expand_cli_inputs(argv)
-    except OSError as err:
+        papers = expand_cli_inputs(argv, input_column=args.column)
+    except (InputFileError, OSError) as err:
         path = err.filename or "<unknown>"
-        parser.error(f"unable to read input file '{path}': {err.strerror or err}")
+        if isinstance(err, OSError):
+            parser.error(f"unable to read input file '{path}': {err.strerror or err}")
+        parser.error(str(err))
 
     if not papers:
         parser.error("provide at least one paper id / URL or --input-file PATH")