From a45ced89ded7eacf818196e70004341d31d05afd Mon Sep 17 00:00:00 2001 From: Jaroslav Benes Date: Wed, 8 Apr 2026 12:25:34 +0200 Subject: [PATCH] Initial commit: llmqt LLM Query Tester Single-file Python CLI to batch-test multiple LLM models with predefined queries. Supports YAML/JSON config, reasoning detection (<think> tags and reasoning_content field), per-query token/speed stats, and graceful API error handling. Install with `pip install -e .` to get the `llmqt` command. Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 25 ++++ CLAUDE.md | 145 ++++++++++++++++++++++ README.md | 82 +++++++++++++ example_system_prompt.md | 1 + example_test.yaml | 14 +++ llmqt.py | 256 +++++++++++++++++++++++++++++++++++++++ pyproject.toml | 19 +++ 7 files changed, 542 insertions(+) create mode 100644 .gitignore create mode 100644 CLAUDE.md create mode 100644 README.md create mode 100644 example_system_prompt.md create mode 100644 example_test.yaml create mode 100644 llmqt.py create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6be6a9f --- /dev/null +++ b/.gitignore @@ -0,0 +1,25 @@ +# Python +__pycache__/ +*.py[cod] +*.pyo +*.egg +*.egg-info/ +dist/ +build/ +.eggs/ +.venv/ +venv/ +env/ + +# llmqt outputs (directories created by the script) +# Uncomment to ignore all test output dirs: +# */ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Env +.env diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..0be900f --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,145 @@ +# CLAUDE.md — llmqt + +## Project overview + +`llmqt` (LLM Query Tester) is a single-file Python CLI that batch-tests multiple LLM models +against a set of queries. Results are written as Markdown files with per-query stats and +optional reasoning sections.
+ +## Structure + +``` +llmqt/ + llmqt.py # entire implementation — single module + pyproject.toml # build/install config; declares `llmqt` console script + example_test.yaml # MUST be kept up to date with every config format change + example_system_prompt.md # system prompt used by example_test.yaml + README.md + CLAUDE.md + .gitignore +``` + +## Installation + +```bash +pip install -e . +``` + +Registers the `llmqt` entry point from `pyproject.toml` so the command works from any directory. + +## CLI signature + +``` +llmqt <system_prompt.md> <config1.yaml> [config2.yaml ...] +``` + +- **First argument**: path to a `.md` file containing the system prompt (resolved from CWD) +- **Remaining arguments**: one or more test config files (YAML or JSON) + +## Environment variables + +| Variable | Required | Purpose | +|------------------|----------|------------------------------------------------------| +| `OPENAI_API_KEY` | Yes | API key | +| `OPENAI_API_BASE`| No | Custom base URL for OpenAI-compatible endpoints | + +`OPENAI_BASE_URL` is also accepted as an alias for `OPENAI_API_BASE`. + +## Config file format (YAML or JSON) + +**IMPORTANT: whenever the config format changes, update `example_test.yaml` to reflect it.** + +The system prompt is **not** part of the config file — it is passed as the first CLI argument.
+ +### YAML example + +```yaml +models: + - gpt-4o-mini + - gpt-4o + +queries: + - "First query text" + - "Second query text" +``` + +### JSON equivalent + +```json +{ + "models": ["gpt-4o-mini", "gpt-4o"], + "queries": ["First query text", "Second query text"] +} +``` + +### Field reference + +| Field | Type | Description | +|-----------|-----------------|------------------------------------------------------| +| `models` | list of strings | Model names; any OpenAI-compatible identifier | +| `queries` | list of strings | Queries sent to each model in listed order | + +## Execution logic + +``` +for each config file: + for each model: + for each query: + POST to API (with timing), wait for response + write <config_stem>/<model>.md (in CWD) +``` + +Output directory is always relative to the **current working directory**, not the config file +location. This lets the user run `llmqt ~/configs/prompt.md ~/configs/test1.yaml` from any +writable directory and have outputs land there. + +## Filename sanitization + +Model names are sanitized for filesystem safety: characters outside `[A-Za-z0-9._- ]` are +replaced with `_`. E.g. `anthropic/claude-3` → `anthropic_claude-3.md`. + +## Reasoning detection + +Checked in this order: +1. `message.reasoning_content` attribute (DeepSeek API / some OpenAI-compatible endpoints) +2. `<think>...</think>` tags in the response content (DeepSeek R1, QwQ open-source models) + +If reasoning is found it is stripped from the answer and rendered in a separate section.
+ +## Output format per model file + +```markdown +# <model> + +**Config:** `test1.yaml` + +## Statistics + +| Query | Elapsed | Prompt tok | Completion tok | Total tok | tok/s | +|-------|---------|------------|----------------|-----------|-------| +| 1 | 1.2s | 45 | 120 | 165 | 100.0 | +| Total | 1.2s | 45 | 120 | 165 | 100.0 | + +--- + +## Query 1 + +> <query> + +*1.2s · 120 completion tokens · 100.0 tok/s* + +### Reasoning ← only present when reasoning was detected + +<reasoning> + +### Response + +<answer> + +--- +``` + +## Dependencies + +- `openai >= 1.0.0` — API client +- `pyyaml >= 6.0` — YAML parsing (imported lazily; JSON works without it) diff --git a/README.md b/README.md new file mode 100644 index 0000000..4285913 --- /dev/null +++ b/README.md @@ -0,0 +1,82 @@ +# llmqt — LLM Query Tester + +Batch-test multiple LLM models against a set of queries. Results are saved as nicely formatted Markdown files — one per model — including per-query stats and a summary table. + +## Install + +```bash +pip install -e . +``` + +This installs the `llmqt` command into your PATH. + +## Setup + +Export your API credentials: + +```bash +export OPENAI_API_KEY=your_key_here +export OPENAI_API_BASE=https://your-endpoint/v1 # optional, for custom/local endpoints +``` + +## Usage + +```bash +llmqt <system_prompt.md> <config1.yaml> [config2.yaml ...] +``` + +Examples: + +```bash +llmqt prompt.md test1.yaml +llmqt prompt.md test1.yaml test2.yaml test3.json +``` + +Outputs are written to `./<config_stem>/<model>.md` in the current working directory. + +## Config file format + +YAML (`.yaml` / `.yml`) and JSON (`.json`) are both supported. + +```yaml +models: + - gpt-4o-mini + - gpt-4o + +queries: + - "What is the capital of France?" + - "Explain TCP vs UDP." + - "Write a Python prime-checker function." +``` + +See [example_test.yaml](example_test.yaml) and [example_system_prompt.md](example_system_prompt.md).
+ +## Output format + +For `llmqt prompt.md test1.yaml` with models `gpt-4o-mini` and `gpt-4o`: + +``` +test1/ + gpt-4o-mini.md + gpt-4o.md +``` + +Each file contains: + +- A **statistics table** (elapsed time, prompt/completion tokens, tok/s per query + totals) +- For each query: the query text, per-query stats, optional **Reasoning** section (if the model returns chain-of-thought), and the **Response** + +### Reasoning detection + +Reasoning content is extracted automatically from: +- The `reasoning_content` field on the message (DeepSeek API style) +- `<think>...</think>` tags in the response content (DeepSeek R1 / QwQ open-source style) + +## Execution order + +``` +for each config file: + for each model: + for each query → POST to API, wait for response + write <config_stem>/<model>.md in CWD +``` diff --git a/example_system_prompt.md b/example_system_prompt.md new file mode 100644 index 0000000..da9d152 --- /dev/null +++ b/example_system_prompt.md @@ -0,0 +1 @@ +You are a helpful assistant. Answer questions clearly and concisely. diff --git a/example_test.yaml b/example_test.yaml new file mode 100644 index 0000000..3712dab --- /dev/null +++ b/example_test.yaml @@ -0,0 +1,14 @@ +# llmqt example config +# Run with: llmqt example_system_prompt.md example_test.yaml +# Outputs will be written to ./example_test/<model>.md + +# List of models to test. Any OpenAI-compatible model name works. +models: + - gpt-4o-mini + - gpt-4o + +# List of queries to send to each model (in order). +queries: + - "What is the capital of France?" + - "Explain the difference between TCP and UDP in simple terms." + - "Write a Python function that checks if a number is prime."
diff --git a/llmqt.py b/llmqt.py new file mode 100644 index 0000000..3b5f8d4 --- /dev/null +++ b/llmqt.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python3 +"""llmqt - LLM Query Tester""" + +import os +import re +import sys +import json +import time +from pathlib import Path + +try: + import yaml +except ImportError: + yaml = None + +from openai import OpenAI + + +def load_config(config_path: Path) -> dict: + suffix = config_path.suffix.lower() + with open(config_path) as f: + if suffix in ('.yaml', '.yml'): + if yaml is None: + print("Error: pyyaml is required for YAML configs. Run: pip install pyyaml") + sys.exit(1) + return yaml.safe_load(f) + elif suffix == '.json': + return json.load(f) + else: + raise ValueError(f"Unsupported config format: {config_path.suffix} (use .yaml, .yml, or .json)") + + +def run_query( + client: OpenAI, model: str, system_prompt: str, query: str +) -> tuple[str | None, str, dict]: + """ + Send a query and return (reasoning, answer, stats). + + stats keys: + prompt_tokens, completion_tokens, total_tokens, + elapsed_s, tokens_per_sec, error + """ + t0 = time.monotonic() + try: + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": query}, + ], + ) + except Exception as exc: + elapsed = time.monotonic() - t0 + stats = { + "prompt_tokens": 0, + "completion_tokens": 0, + "total_tokens": 0, + "elapsed_s": elapsed, + "tokens_per_sec": 0.0, + "error": str(exc), + } + return None, "", stats + elapsed = time.monotonic() - t0 + + message = response.choices[0].message + content = message.content or "" + + # Some APIs (e.g. DeepSeek) expose reasoning_content as a separate field. + reasoning = getattr(message, "reasoning_content", None) + + if not reasoning: + # Fall back to extracting <think>...</think> from content.
+ think_match = re.search(r"<think>(.*?)</think>", content, re.DOTALL) + if think_match: + reasoning = think_match.group(1).strip() + content = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL).strip() + + usage = response.usage + prompt_tokens = usage.prompt_tokens if usage else 0 + completion_tokens = usage.completion_tokens if usage else 0 + total_tokens = usage.total_tokens if usage else 0 + tps = completion_tokens / elapsed if elapsed > 0 else 0.0 + + stats = { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "elapsed_s": elapsed, + "tokens_per_sec": tps, + "error": None, + } + + return reasoning or None, content, stats + + +def sanitize_filename(name: str) -> str: + """Replace characters that are unsafe in filenames.""" + return "".join(c if c.isalnum() or c in "._- " else "_" for c in name).strip() + + +def format_stats_inline(stats: dict) -> str: + return ( + f"{stats['elapsed_s']:.1f}s | " + f"{stats['completion_tokens']} completion tokens | " + f"{stats['tokens_per_sec']:.1f} tok/s" + ) + + +def format_stats_table(indexed_stats: list[tuple[int, dict]]) -> str: + """Render a summary stats table.
indexed_stats is [(query_number, stats), ...].""" + all_stats = [s for _, s in indexed_stats] + total_prompt = sum(s["prompt_tokens"] for s in all_stats) + total_comp = sum(s["completion_tokens"] for s in all_stats) + total_tok = sum(s["total_tokens"] for s in all_stats) + total_elapsed = sum(s["elapsed_s"] for s in all_stats) + avg_tps = total_comp / total_elapsed if total_elapsed > 0 else 0.0 + + lines = [ + "| Query | Elapsed | Prompt tok | Completion tok | Total tok | tok/s |", + "|-------|---------|------------|----------------|-----------|-------|", + ] + for i, s in indexed_stats: + lines.append( + f"| {i} " + f"| {s['elapsed_s']:.1f}s " + f"| {s['prompt_tokens']} " + f"| {s['completion_tokens']} " + f"| {s['total_tokens']} " + f"| {s['tokens_per_sec']:.1f} |" + ) + lines.append( + f"| **Total** " + f"| **{total_elapsed:.1f}s** " + f"| **{total_prompt}** " + f"| **{total_comp}** " + f"| **{total_tok}** " + f"| **{avg_tps:.1f}** |" + ) + return "\n".join(lines) + + +def process_config(config_path: Path, system_prompt: str, client: OpenAI) -> None: + config = load_config(config_path) + + for key in ("models", "queries"): + if key not in config: + print(f"Error: Missing '{key}' in {config_path}") + sys.exit(1) + + models = config["models"] + queries = config["queries"] + + output_dir = Path.cwd() / config_path.stem + output_dir.mkdir(parents=True, exist_ok=True) + + for model in models: + print(f"\n Model: {model}") + results: list[tuple[str, str | None, str, dict]] = [] + + for i, query in enumerate(queries, 1): + preview = query[:70] + ("..." 
if len(query) > 70 else "") + print(f" Query {i}/{len(queries)}: {preview}") + reasoning, answer, stats = run_query(client, model, system_prompt, query) + results.append((query, reasoning, answer, stats)) + if stats["error"]: + print(f" -> ERROR: {stats['error']}") + else: + tag = " [reasoning]" if reasoning else "" + print(f" -> {format_stats_inline(stats)}{tag}") + + model_filename = sanitize_filename(model) + ".md" + output_path = output_dir / model_filename + + with open(output_path, "w") as f: + f.write(f"# {model}\n\n") + f.write(f"**Config:** `{config_path.name}`\n\n") + + # Summary stats table (only successful queries, preserving query number) + successful_stats = [(i, s) for i, (_, _, _, s) in enumerate(results, 1) if not s["error"]] + if successful_stats: + f.write("## Statistics\n\n") + f.write(format_stats_table(successful_stats)) + f.write("\n\n---\n\n") + + for i, (query, reasoning, answer, stats) in enumerate(results, 1): + f.write(f"## Query {i}\n\n") + f.write(f"> {query}\n\n") + + if stats["error"]: + f.write(f"> [!WARNING]\n> **Error:** {stats['error']}\n\n") + f.write("---\n\n") + continue + + f.write( + f"*{stats['elapsed_s']:.1f}s · " + f"{stats['completion_tokens']} completion tokens · " + f"{stats['tokens_per_sec']:.1f} tok/s*\n\n" + ) + + if reasoning: + f.write("### Reasoning\n\n") + f.write(f"{reasoning}\n\n") + + f.write("### Response\n\n") + f.write(f"{answer}\n\n") + f.write("---\n\n") + + print(f" Saved: {output_path}") + + +def main(): + if len(sys.argv) < 3: + print("Usage: llmqt <system_prompt.md> <config1.yaml> [config2.yaml ...]") + print() + print("Environment variables:") + print(" OPENAI_API_KEY (required)") + print(" OPENAI_API_BASE (optional, for custom endpoints)") + sys.exit(1) + + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + print("Error: OPENAI_API_KEY environment variable not set.") + sys.exit(1) + + api_base = os.environ.get("OPENAI_API_BASE") or os.environ.get("OPENAI_BASE_URL") + client_kwargs = {"api_key": api_key} + if api_base: +
client_kwargs["base_url"] = api_base + + client = OpenAI(**client_kwargs) + + prompt_path = Path(sys.argv[1]) + if not prompt_path.exists(): + print(f"Error: System prompt file not found: {prompt_path}") + sys.exit(1) + + with open(prompt_path) as f: + system_prompt = f.read() + + config_paths = [] + for arg in sys.argv[2:]: + p = Path(arg) + if not p.exists(): + print(f"Error: Config file not found: {p}") + sys.exit(1) + config_paths.append(p) + + for config_path in config_paths: + print(f"\nProcessing: {config_path}") + process_config(config_path, system_prompt, client) + + print("\nDone.") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..56293c2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,19 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "llmqt" +version = "0.1.0" +description = "LLM Query Tester — batch-test multiple models with predefined queries" +requires-python = ">=3.9" +dependencies = [ + "openai>=1.0.0", + "pyyaml>=6.0", +] + +[project.scripts] +llmqt = "llmqt:main" + +[tool.setuptools] +py-modules = ["llmqt"]