From a45ced89ded7eacf818196e70004341d31d05afd Mon Sep 17 00:00:00 2001
From: Jaroslav Benes <jaroslavbenes@ecoposta.sk>
Date: Wed, 8 Apr 2026 12:25:34 +0200
Subject: [PATCH] Initial commit: llmqt LLM Query Tester

Single-file Python CLI to batch-test multiple LLM models with predefined
queries. Supports YAML/JSON config, reasoning detection (<think> tags and
reasoning_content field), per-query token/speed stats, and graceful API
error handling. Install with `pip install -e .` to get the `llmqt` command.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .gitignore               |  25 ++++
 CLAUDE.md                | 145 ++++++++++++++++++++++
 README.md                |  82 +++++++++++++
 example_system_prompt.md |   1 +
 example_test.yaml        |  14 +++
 llmqt.py                 | 256 +++++++++++++++++++++++++++++++++++++++
 pyproject.toml           |  19 +++
 7 files changed, 542 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 CLAUDE.md
 create mode 100644 README.md
 create mode 100644 example_system_prompt.md
 create mode 100644 example_test.yaml
 create mode 100644 llmqt.py
 create mode 100644 pyproject.toml
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6be6a9f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,25 @@
+# Python
+__pycache__/
+*.py[cod]
+*.pyo
+*.egg
+*.egg-info/
+dist/
+build/
+.eggs/
+.venv/
+venv/
+env/
+
+# llmqt outputs (directories created by the script)
+# Uncomment to ignore all test output dirs:
+# */
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Env
+.env
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..0be900f
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,145 @@
+# CLAUDE.md — llmqt
+
+## Project overview
+
+`llmqt` (LLM Query Tester) is a single-file Python CLI that batch-tests multiple LLM models
+against a set of queries. Results are written as Markdown files with per-query stats and
+optional reasoning sections.
+
+## Structure
+
+```
+llmqt/
+  llmqt.py                  # entire implementation — single module
+  pyproject.toml            # build/install config; declares `llmqt` console script
+  example_test.yaml         # MUST be kept up to date with every config format change
+  example_system_prompt.md  # system prompt used by example_test.yaml
+  README.md
+  CLAUDE.md
+  .gitignore
+```
+
+## Installation
+
+```bash
+pip install -e .
+```
+
+Registers the `llmqt` entry point from `pyproject.toml` so the command works from any directory.
+
+## CLI signature
+
+```
+llmqt <system_prompt.md> <config1.yaml> [config2.yaml ...]
+```
+
+- **First argument**: path to a `.md` file containing the system prompt (resolved from CWD)
+- **Remaining arguments**: one or more test config files (YAML or JSON)
+
+## Environment variables
+
+| Variable         | Required | Purpose                                              |
+|------------------|----------|------------------------------------------------------|
+| `OPENAI_API_KEY` | Yes      | API key                                              |
+| `OPENAI_API_BASE`| No       | Custom base URL for OpenAI-compatible endpoints      |
+
+`OPENAI_BASE_URL` is also accepted as an alias for `OPENAI_API_BASE`.
+
+## Config file format (YAML or JSON)
+
+**IMPORTANT: whenever the config format changes, update `example_test.yaml` to reflect it.**
+
+The system prompt is **not** part of the config file — it is passed as the first CLI argument.
+
+### YAML example
+
+```yaml
+models:
+  - gpt-4o-mini
+  - gpt-4o
+
+queries:
+  - "First query text"
+  - "Second query text"
+```
+
+### JSON equivalent
+
+```json
+{
+  "models": ["gpt-4o-mini", "gpt-4o"],
+  "queries": ["First query text", "Second query text"]
+}
+```
+
+### Field reference
+
+| Field     | Type            | Description                                          |
+|-----------|-----------------|------------------------------------------------------|
+| `models`  | list of strings | Model names; any OpenAI-compatible identifier        |
+| `queries` | list of strings | Queries sent to each model in listed order           |
+
+## Execution logic
+
+```
+for each config file:
+  for each model:
+    for each query:
+      POST to API (with timing), wait for response
+    write <config_stem>/<model_name>.md  (in CWD)
+```
+
+Output directory is always relative to the **current working directory**, not the config file
+location. This lets the user run `llmqt ~/configs/prompt.md ~/configs/test1.yaml` from any
+writable directory and have outputs land there.
+
+## Filename sanitization
+
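+Model names are sanitized for filesystem safety: every character that is neither alphanumeric
+(Unicode-aware, per `str.isalnum`) nor one of `.`, `_`, `-` or space is replaced with `_`, and surrounding whitespace is trimmed. E.g. `anthropic/claude-3` → `anthropic_claude-3.md`.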
+
+## Reasoning detection
+
+Checked in this order:
+1. `message.reasoning_content` attribute (DeepSeek API / some OpenAI-compatible endpoints)
+2. `<think>...</think>` tags in the response content (DeepSeek R1, QwQ open-source models)
+
+If reasoning is found it is rendered in a separate section; when it came from `<think>` tags, those tags are also stripped from the answer.
+
+## Output format per model file
+
+```markdown
+# <model name>
+
+**Config:** `test1.yaml`
+
+## Statistics
+
+| Query | Elapsed | Prompt tok | Completion tok | Total tok | tok/s |
+|-------|---------|------------|----------------|-----------|-------|
+| 1     | 1.2s    | 45         | 120            | 165       | 100.0 |
+| **Total** | **1.2s** | **45** | **120** | **165** | **100.0** |
+
+---
+
+## Query 1
+
+> <query text>
+
+*1.2s · 120 completion tokens · 100.0 tok/s*
+
+### Reasoning        ← only present when reasoning was detected
+
+<reasoning text>
+
+### Response
+
+<answer text>
+
+---
+```
+
+## Dependencies
+
+- `openai >= 1.0.0` — API client
+- `pyyaml >= 6.0` — YAML parsing (optional import guarded by `try/except ImportError`; JSON configs work without it)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4285913
--- /dev/null
+++ b/README.md
@@ -0,0 +1,82 @@
+# llmqt — LLM Query Tester
+
+Batch-test multiple LLM models against a set of queries. Results are saved as nicely formatted Markdown files — one per model — including per-query stats and a summary table.
+
+## Install
+
+```bash
+pip install -e .
+```
+
+This installs the `llmqt` command into your PATH.
+
+## Setup
+
+Export your API credentials:
+
+```bash
+export OPENAI_API_KEY=your_key_here
+export OPENAI_API_BASE=https://your-endpoint/v1  # optional, for custom/local endpoints
+```
+
+## Usage
+
+```bash
+llmqt <system_prompt.md> <config1.yaml> [config2.yaml ...]
+```
+
+Examples:
+
+```bash
+llmqt prompt.md test1.yaml
+llmqt prompt.md test1.yaml test2.yaml test3.json
+```
+
+Outputs are written to `./<config_stem>/<model_name>.md` in the current working directory.
+
+## Config file format
+
+YAML (`.yaml` / `.yml`) and JSON (`.json`) are both supported.
+
+```yaml
+models:
+  - gpt-4o-mini
+  - gpt-4o
+
+queries:
+  - "What is the capital of France?"
+  - "Explain TCP vs UDP."
+  - "Write a Python prime-checker function."
+```
+
+See [example_test.yaml](example_test.yaml) and [example_system_prompt.md](example_system_prompt.md).
+
+## Output format
+
+For `llmqt prompt.md test1.yaml` with models `gpt-4o-mini` and `gpt-4o`:
+
+```
+test1/
+  gpt-4o-mini.md
+  gpt-4o.md
+```
+
+Each file contains:
+
+- A **statistics table** (elapsed time, prompt/completion tokens, tok/s per query + totals)
+- For each query: the query text, per-query stats, optional **Reasoning** section (if the model returns chain-of-thought), and the **Response**
+
+### Reasoning detection
+
+Reasoning content is extracted automatically from:
+- The `reasoning_content` field on the message (DeepSeek API style)
+- `<think>...</think>` tags in the response content (DeepSeek R1 / QwQ open-source style)
+
+## Execution order
+
+```
+for each config file:
+  for each model:
+    for each query → POST to API, wait for response
+    write <config_stem>/<model>.md in CWD
+```
diff --git a/example_system_prompt.md b/example_system_prompt.md
new file mode 100644
index 0000000..da9d152
--- /dev/null
+++ b/example_system_prompt.md
@@ -0,0 +1 @@
+You are a helpful assistant. Answer questions clearly and concisely.
diff --git a/example_test.yaml b/example_test.yaml
new file mode 100644
index 0000000..3712dab
--- /dev/null
+++ b/example_test.yaml
@@ -0,0 +1,14 @@
+# llmqt example config
+# Run with: llmqt example_system_prompt.md example_test.yaml
+# Outputs will be written to ./example_test/<model_name>.md
+
+# List of models to test. Any OpenAI-compatible model name works.
+models:
+  - gpt-4o-mini
+  - gpt-4o
+
+# List of queries to send to each model (in order).
+queries:
+  - "What is the capital of France?"
+  - "Explain the difference between TCP and UDP in simple terms."
+  - "Write a Python function that checks if a number is prime."
diff --git a/llmqt.py b/llmqt.py
new file mode 100644
index 0000000..3b5f8d4
--- /dev/null
+++ b/llmqt.py
@@ -0,0 +1,256 @@
+#!/usr/bin/env python3
+"""llmqt - LLM Query Tester"""
+
+import os
+import re
+import sys
+import json
+import time
+from pathlib import Path
+
+try:
+    import yaml
+except ImportError:
+    yaml = None
+
+from openai import OpenAI
+
+
+def load_config(config_path: Path) -> dict:
+    """Parse a YAML/JSON config as UTF-8 ({} if empty); ValueError on other suffixes."""
+    suffix = config_path.suffix.lower()
+    if suffix not in ('.yaml', '.yml', '.json'):
+        raise ValueError(f"Unsupported config format: {config_path.suffix} (use .yaml, .yml, or .json)")
+    if suffix != '.json' and yaml is None:
+        print("Error: pyyaml is required for YAML configs. Run: pip install pyyaml")
+        sys.exit(1)
+    # Explicit UTF-8 so non-ASCII queries do not depend on the locale encoding;
+    # `or {}` turns an empty document (parsed as None) into an empty config.
+    with open(config_path, encoding="utf-8") as f:
+        return (json.load(f) if suffix == '.json' else yaml.safe_load(f)) or {}
+
+
+def run_query(
+    client: "OpenAI", model: str, system_prompt: str, query: str
+) -> "tuple[str | None, str, dict]":
+    """
+    Send one system+user exchange to `model`; return (reasoning, answer, stats).
+
+    API failures never raise: a request exception or a response without
+    choices yields (None, "", stats) with the message in stats["error"].
+
+    stats keys:
+      prompt_tokens, completion_tokens, total_tokens,
+      elapsed_s, tokens_per_sec, error
+    """
+    stats = {
+        "prompt_tokens": 0,
+        "completion_tokens": 0,
+        "total_tokens": 0,
+        "elapsed_s": 0.0,
+        "tokens_per_sec": 0.0,
+        "error": None,
+    }
+    t0 = time.monotonic()
+    try:
+        response = client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": query},
+            ],
+        )
+    except Exception as exc:
+        # Deliberately broad: one failing query must not abort the batch.
+        stats["elapsed_s"] = time.monotonic() - t0
+        stats["error"] = str(exc)
+        return None, "", stats
+    elapsed = stats["elapsed_s"] = time.monotonic() - t0
+
+    # Guard an empty/missing choices list, which would otherwise raise IndexError.
+    if not response.choices:
+        stats["error"] = "API returned no choices"
+        return None, "", stats
+    message = response.choices[0].message
+    content = message.content or ""
+
+    # Some APIs (e.g. DeepSeek) expose reasoning_content as a separate field.
+    reasoning = getattr(message, "reasoning_content", None)
+    if not reasoning:
+        # Fall back to extracting <think>...</think> from content.
+        think_match = re.search(r"<think>(.*?)</think>", content, re.DOTALL)
+        if think_match:
+            reasoning = think_match.group(1).strip()
+            content = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL).strip()
+
+    usage = response.usage
+    if usage:
+        stats["prompt_tokens"] = usage.prompt_tokens or 0  # tolerate None counters
+        stats["completion_tokens"] = usage.completion_tokens or 0
+        stats["total_tokens"] = usage.total_tokens or 0
+    stats["tokens_per_sec"] = stats["completion_tokens"] / elapsed if elapsed > 0 else 0.0
+
+    return reasoning or None, content, stats
+
+
+def sanitize_filename(name: str) -> str:
+    """Return `name` trimmed, with each char that is neither alphanumeric nor in '._- ' turned into '_'."""
+    return "".join(map(lambda ch: ch if ch in "._- " or ch.isalnum() else "_", name)).strip()
+
+
+def format_stats_inline(stats: dict) -> str:
+    """Render elapsed time, completion tokens and tok/s as one ' | '-separated line."""
+    elapsed = f"{stats['elapsed_s']:.1f}s"
+    produced = f"{stats['completion_tokens']} completion tokens"
+    speed = f"{stats['tokens_per_sec']:.1f} tok/s"
+    return " | ".join((elapsed, produced, speed))
+
+
+def format_stats_table(indexed_stats: list[tuple[int, dict]]) -> str:
+    """
+    Build the Markdown statistics table for one model.
+
+    indexed_stats holds (query_number, stats) pairs; one row is emitted per
+    pair, followed by a bold Total row whose tok/s is total completion
+    tokens divided by total elapsed time (0.0 when no time elapsed).
+    """
+    def column_sum(key: str):
+        return sum(stats[key] for _, stats in indexed_stats)
+
+    elapsed_sum = column_sum("elapsed_s")
+    prompt_sum = column_sum("prompt_tokens")
+    completion_sum = column_sum("completion_tokens")
+    token_sum = column_sum("total_tokens")
+    overall_tps = completion_sum / elapsed_sum if elapsed_sum > 0 else 0.0
+
+    header = "| Query | Elapsed | Prompt tok | Completion tok | Total tok | tok/s |"
+    divider = "|-------|---------|------------|----------------|-----------|-------|"
+    body = [
+        f"| {number} | {row['elapsed_s']:.1f}s | {row['prompt_tokens']} "
+        f"| {row['completion_tokens']} | {row['total_tokens']} "
+        f"| {row['tokens_per_sec']:.1f} |"
+        for number, row in indexed_stats
+    ]
+    footer = (
+        f"| **Total** | **{elapsed_sum:.1f}s** | **{prompt_sum}** "
+        f"| **{completion_sum}** | **{token_sum}** | **{overall_tps:.1f}** |"
+    )
+
+    return "\n".join([header, divider, *body, footer])
+
+
+def process_config(config_path: Path, system_prompt: str, client: OpenAI) -> None:
+    """
+    Run every query of one config against every model and write the reports.
+
+    Writes <cwd>/<config stem>/<sanitized model>.md (UTF-8) per model. Exits
+    with status 1 when the config is not a mapping or lacks models/queries.
+    """
+    config = load_config(config_path)
+    # An empty YAML document parses to None; reject it instead of crashing.
+    if not isinstance(config, dict):
+        print(f"Error: {config_path} must contain a mapping with 'models' and 'queries'")
+        sys.exit(1)
+    for key in ("models", "queries"):
+        if key not in config:
+            print(f"Error: Missing '{key}' in {config_path}")
+            sys.exit(1)
+
+    models = config["models"]
+    queries = config["queries"]
+
+    output_dir = Path.cwd() / config_path.stem
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    for model in models:
+        print(f"\n  Model: {model}")
+        results: list[tuple[str, str | None, str, dict]] = []
+
+        for i, query in enumerate(queries, 1):
+            preview = query[:70] + ("..." if len(query) > 70 else "")
+            print(f"    Query {i}/{len(queries)}: {preview}")
+            reasoning, answer, stats = run_query(client, model, system_prompt, query)
+            results.append((query, reasoning, answer, stats))
+            if stats["error"]:
+                print(f"    -> ERROR: {stats['error']}")
+            else:
+                tag = " [reasoning]" if reasoning else ""
+                print(f"    -> {format_stats_inline(stats)}{tag}")
+
+        output_path = output_dir / (sanitize_filename(model) + ".md")
+        # Explicit UTF-8: model output must not depend on the locale encoding.
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(f"# {model}\n\n**Config:** `{config_path.name}`\n\n")
+            # Summary stats table (only successful queries, preserving query number)
+            successful_stats = [(i, s) for i, (_, _, _, s) in enumerate(results, 1) if not s["error"]]
+            if successful_stats:
+                f.write("## Statistics\n\n")
+                f.write(format_stats_table(successful_stats))
+                f.write("\n\n---\n\n")
+
+            for i, (query, reasoning, answer, stats) in enumerate(results, 1):
+                # Prefix every line so multi-line text stays inside the blockquote.
+                quoted = "> " + query.replace("\n", "\n> ")
+                f.write(f"## Query {i}\n\n{quoted}\n\n")
+                if stats["error"]:
+                    error = stats["error"].replace("\n", "\n> ")
+                    f.write(f"> [!WARNING]\n> **Error:** {error}\n\n---\n\n")
+                    continue
+                f.write(
+                    f"*{stats['elapsed_s']:.1f}s · "
+                    f"{stats['completion_tokens']} completion tokens · "
+                    f"{stats['tokens_per_sec']:.1f} tok/s*\n\n"
+                )
+                if reasoning:
+                    f.write(f"### Reasoning\n\n{reasoning}\n\n")
+                f.write(f"### Response\n\n{answer}\n\n---\n\n")
+
+        print(f"    Saved: {output_path}")
+
+
+def main():
+    """CLI entry point: check args/env, read the UTF-8 prompt, run each config; exit 1 on error."""
+    if len(sys.argv) < 3:
+        print("Usage: llmqt <system_prompt.md> <config1.yaml> [config2.yaml ...]")
+        print()
+        print("Environment variables:")
+        print("  OPENAI_API_KEY   (required)")
+        print("  OPENAI_API_BASE  (optional, for custom endpoints)")
+        sys.exit(1)
+
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        print("Error: OPENAI_API_KEY environment variable not set.")
+        sys.exit(1)
+
+    client_kwargs = {"api_key": api_key}
+    api_base = os.environ.get("OPENAI_API_BASE") or os.environ.get("OPENAI_BASE_URL")
+    if api_base:
+        client_kwargs["base_url"] = api_base
+    client = OpenAI(**client_kwargs)
+
+    prompt_path = Path(sys.argv[1])
+    if not prompt_path.exists():
+        print(f"Error: System prompt file not found: {prompt_path}")
+        sys.exit(1)
+    system_prompt = prompt_path.read_text(encoding="utf-8")
+
+    config_paths = [Path(arg) for arg in sys.argv[2:]]
+    for p in config_paths:
+        if not p.exists():
+            print(f"Error: Config file not found: {p}")
+            sys.exit(1)
+
+    for config_path in config_paths:
+        print(f"\nProcessing: {config_path}")
+        try:
+            process_config(config_path, system_prompt, client)
+        except ValueError as exc:
+            # Unsupported config suffix or malformed JSON (JSONDecodeError is a ValueError).
+            print(f"Error: {exc}")
+            sys.exit(1)
+
+    print("\nDone.")
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..56293c2
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,19 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "llmqt"
+version = "0.1.0"
+description = "LLM Query Tester — batch-test multiple models with predefined queries"
+requires-python = ">=3.9"
+dependencies = [
+    "openai>=1.0.0",
+    "pyyaml>=6.0",
+]
+
+[project.scripts]
+llmqt = "llmqt:main"
+
+[tool.setuptools]
+py-modules = ["llmqt"]