ci(issues): add LLM-driven duplicate issue detection (#2381)

fallenbagel
2026-02-08 15:45:49 +05:00
committed by GitHub
parent 2dac679f1b
commit 0ffe3e8067
7 changed files with 1692 additions and 0 deletions

.github/workflows/detect-duplicate.yml vendored Normal file

@@ -0,0 +1,72 @@
# yaml-language-server: $schema=https://json.schemastore.org/github-workflow.json
name: Duplicate Issue Detector
on:
issues:
types: [opened]
permissions: {}
env:
EMBEDDING_MODEL: ${{ vars.EMBEDDING_MODEL }}
GROQ_MODEL: ${{ vars.GROQ_MODEL }}
GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
jobs:
detect-duplicate:
runs-on: ubuntu-24.04
if: ${{ !github.event.issue.pull_request }}
permissions:
issues: write
actions: read
contents: read
steps:
- name: Checkout repository
uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5.0.1
- name: Set up Node.js
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
with:
node-version-file: 'package.json'
- name: Cache embedding model
uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
with:
path: ~/.cache/huggingface
key: hf-model-${{ vars.EMBEDDING_MODEL }}
- name: Install dependencies
working-directory: bin/duplicate-detector
run: npm ci
- name: Download issue index
uses: dawidd6/action-download-artifact@5c98f0b039f36ef966fdb7dfa9779262785ecb05 # v14
with:
name: issue-index
workflow: rebuild-issue-index.yml
path: bin/duplicate-detector
search_artifacts: true
if_no_artifact_found: warn
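          # A missing artifact only warns; the "Build index if missing" step below rebuilds it.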
- name: Build index if missing
working-directory: bin/duplicate-detector
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_REPOSITORY: ${{ github.repository }}
INDEX_PATH: issue_index.json
run: |
if [ ! -f issue_index.json ]; then
echo "No index found — building from scratch..."
node build-index.mjs
fi
- name: Detect duplicates
working-directory: bin/duplicate-detector
continue-on-error: true
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_REPOSITORY: ${{ github.repository }}
ISSUE_NUMBER: ${{ github.event.issue.number }}
INDEX_PATH: issue_index.json
run: node detect.mjs

.github/workflows/rebuild-issue-index.yml vendored Normal file

@@ -0,0 +1,54 @@
# yaml-language-server: $schema=https://json.schemastore.org/github-workflow.json
name: Rebuild Issue Index
on:
schedule:
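    # daily at 03:00 UTC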
- cron: "0 3 * * *"
workflow_dispatch:
permissions: {}
env:
EMBEDDING_MODEL: ${{ vars.EMBEDDING_MODEL }}
jobs:
build-index:
runs-on: ubuntu-24.04
permissions:
issues: read
actions: write
contents: read
steps:
- name: Checkout repository
uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5.0.1
- name: Set up Node.js
        uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
with:
node-version-file: 'package.json'
- name: Cache embedding model
uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
with:
path: ~/.cache/huggingface
key: hf-model-${{ vars.EMBEDDING_MODEL }}
- name: Install dependencies
working-directory: bin/duplicate-detector
run: npm ci
- name: Build issue index
working-directory: bin/duplicate-detector
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_REPOSITORY: ${{ github.repository }}
INDEX_PATH: issue_index.json
run: node build-index.mjs
- name: Upload index artifact
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
with:
name: issue-index
path: bin/duplicate-detector/issue_index.json
retention-days: 7

bin/duplicate-detector/build-index.mjs Normal file

@@ -0,0 +1,120 @@
#!/usr/bin/env node
/**
* Build Issue Embedding Index
*
 * Fetches all open issues plus issues closed within the last
 * INCLUDE_CLOSED_DAYS (90) days, generates embeddings with a local ONNX
 * transformer model, and saves them as a JSON artifact for the duplicate
 * detector.
*/
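// Shape of the emitted artifact (illustrative sketch; fields match indexData below):
// {
//   "issues":      [{ number, title, state, url, body_preview, labels, created_at, updated_at }],
//   "embeddings":  [[0.01, -0.02, ...]],  // one L2-normalized vector per issue, same order
//   "model":       "Xenova/all-MiniLM-L6-v2",
//   "issue_count": 1234,
//   "built_at":    "2026-02-08T03:00:00.000Z"
// }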
import { pipeline } from '@huggingface/transformers';
import { mkdirSync, writeFileSync } from 'node:fs';
import { dirname } from 'node:path';
import { fetchIssues, issueText } from './utils.mjs';
const MODEL_NAME = process.env.EMBEDDING_MODEL || 'Xenova/all-MiniLM-L6-v2';
const OUTPUT_PATH = 'issue_index.json';
const INCLUDE_CLOSED_DAYS = 90;
const MAX_ISSUES = 5000;
const BATCH_SIZE = 64;
async function main() {
console.log('Fetching open issues...');
const openIssues = await fetchIssues({
state: 'open',
maxIssues: MAX_ISSUES,
});
console.log(`Fetched ${openIssues.length} open issues`);
const since = new Date(
Date.now() - INCLUDE_CLOSED_DAYS * 24 * 60 * 60 * 1000
).toISOString();
console.log(
`Fetching closed issues from last ${INCLUDE_CLOSED_DAYS} days...`
);
const closedIssues = await fetchIssues({
state: 'closed',
since,
maxIssues: MAX_ISSUES,
});
console.log(`Fetched ${closedIssues.length} closed issues`);
let allIssues = [...openIssues, ...closedIssues];
const seen = new Set();
allIssues = allIssues.filter((issue) => {
if (seen.has(issue.number)) return false;
seen.add(issue.number);
return true;
});
console.log(`Total unique issues to index: ${allIssues.length}`);
if (allIssues.length === 0) {
console.warn('No issues found - writing empty index');
writeFileSync(OUTPUT_PATH, JSON.stringify({ issues: [], embeddings: [] }));
return;
}
console.log(`Loading model: ${MODEL_NAME}`);
const extractor = await pipeline('feature-extraction', MODEL_NAME, {
dtype: 'fp32',
});
const texts = allIssues.map((issue) => issueText(issue.title, issue.body));
const allEmbeddings = [];
console.log(`Generating embeddings for ${texts.length} issues...`);
for (let i = 0; i < texts.length; i += BATCH_SIZE) {
const batch = texts.slice(i, i + BATCH_SIZE);
const output = await extractor(batch, {
pooling: 'mean',
normalize: true,
});
const vectors = output.tolist();
allEmbeddings.push(...vectors);
const progress = Math.min(i + BATCH_SIZE, texts.length);
console.log(` ${progress}/${texts.length}`);
}
const issueMetadata = allIssues.map((issue) => {
const body = (issue.body || '').trim();
return {
number: issue.number,
title: issue.title,
state: issue.state,
url: issue.html_url,
      body_preview: body.slice(0, 500),
labels: (issue.labels || []).map((l) => l.name),
created_at: issue.created_at,
updated_at: issue.updated_at,
};
});
const indexData = {
issues: issueMetadata,
embeddings: allEmbeddings,
model: MODEL_NAME,
issue_count: issueMetadata.length,
built_at: new Date().toISOString(),
};
const dir = dirname(OUTPUT_PATH);
if (dir && dir !== '.') mkdirSync(dir, { recursive: true });
  const json = JSON.stringify(indexData);
  writeFileSync(OUTPUT_PATH, json);
  const sizeMb = (Buffer.byteLength(json) / (1024 * 1024)).toFixed(1);
console.log(
`Index saved to ${OUTPUT_PATH} (${sizeMb} MB, ${issueMetadata.length} issues)`
);
}
main().catch((err) => {
console.error(err);
process.exit(1);
});

bin/duplicate-detector/detect.mjs Normal file

@@ -0,0 +1,274 @@
#!/usr/bin/env node
/**
* Duplicate Issue Detector
*
 * Triggered when a new issue is opened. Embeds the issue, compares it
 * against the prebuilt embedding index, then asks an LLM to confirm the
 * top candidates before posting a comment for maintainer review.
*/
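// Pipeline (see main() below): embed the new issue with the same model used
// by build-index.mjs -> findSimilar() retrieves the top-K neighbours above
// SIMILARITY_THRESHOLD -> confirmWithLlm() asks Groq to keep only true
// duplicates -> formatComment()/postComment()/addLabel() surface the result.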
import { pipeline } from '@huggingface/transformers';
import { existsSync, readFileSync } from 'node:fs';
import {
addLabel,
dotProduct,
fetchIssues,
getIssue,
issueText,
postComment,
} from './utils.mjs';
const SIMILARITY_THRESHOLD = 0.55;
const TOP_K = 5;
const MAX_COMMENT_CANDIDATES = 3;
const MODEL_NAME = process.env.EMBEDDING_MODEL || 'Xenova/all-MiniLM-L6-v2';
const GROQ_MODEL = process.env.GROQ_MODEL || 'llama-3.3-70b-versatile';
const INDEX_PATH = 'issue_index.json';
const LABEL_NAME = 'possible-duplicate';
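// Note: the label must already exist in the repository; addLabel() in
// utils.mjs warns and skips on a 404 instead of creating it.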
const GROQ_API_KEY = process.env.GROQ_API_KEY || '';
const ISSUE_NUMBER = parseInt(process.env.ISSUE_NUMBER, 10);
function loadIndex(path) {
if (!existsSync(path)) {
console.error(
`Index file not found at ${path}. Run build-index.mjs first.`
);
process.exit(1);
}
const data = JSON.parse(readFileSync(path, 'utf-8'));
console.log(`Loaded index with ${data.issues.length} issues`);
return data;
}
function findSimilar(
queryEmbedding,
index,
{ topK = TOP_K, threshold = SIMILARITY_THRESHOLD, excludeNumber } = {}
) {
const { issues, embeddings } = index;
if (!issues.length) return [];
const scored = issues.map((issue, i) => ({
...issue,
score: dotProduct(queryEmbedding, embeddings[i]),
}));
return scored
.sort((a, b) => b.score - a.score)
.filter(
(c) =>
c.score >= threshold && (!excludeNumber || c.number !== excludeNumber)
)
.slice(0, topK);
}
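// Example: with threshold 0.55 and topK 5, scores of [0.81, 0.62, 0.40]
// against three indexed issues yield two candidates, highest score first.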
const CONFIRM_SYSTEM_PROMPT = `You are a GitHub issue triage assistant. You will be given a NEW issue and one \
or more CANDIDATE issues that may be duplicates.
For each candidate, determine if the new issue is truly a duplicate (same root \
problem/request) or merely related (similar area but different issue).
Respond ONLY with a JSON array of objects, each with:
- "number": the candidate issue number
- "duplicate": true or false
- "reason": one-sentence explanation
Example:
[{"number": 123, "duplicate": true, "reason": "Both report the same crash when ..."}]`;
async function confirmWithLlm(newIssue, candidates) {
if (!GROQ_API_KEY) {
console.warn('GROQ_API_KEY not set — skipping LLM confirmation');
return candidates;
}
const candidateText = candidates
.map(
(c) =>
`### Candidate #${c.number} (similarity: ${c.score.toFixed(2)})\n` +
`**Title:** ${c.title}\n` +
`**State:** ${c.state}\n` +
`**Body preview:** ${(c.body_preview || 'N/A').slice(0, 500)}`
)
.join('\n\n');
const userPrompt =
`## NEW ISSUE #${newIssue.number}\n` +
`**Title:** ${newIssue.title}\n` +
`**Body:**\n${(newIssue.body || 'No body').slice(0, 1500)}\n\n` +
`---\n\n` +
`## CANDIDATES\n${candidateText}`;
try {
const resp = await fetch(
'https://api.groq.com/openai/v1/chat/completions',
{
method: 'POST',
headers: {
Authorization: `Bearer ${GROQ_API_KEY}`,
'Content-Type': 'application/json',
},
body: JSON.stringify({
model: GROQ_MODEL,
messages: [
{ role: 'system', content: CONFIRM_SYSTEM_PROMPT },
{ role: 'user', content: userPrompt },
],
temperature: 0.1,
max_tokens: 1024,
}),
signal: AbortSignal.timeout(30_000),
}
);
if (!resp.ok) {
const text = await resp.text();
throw new Error(`Groq API error ${resp.status}: ${text}`);
}
let content = (await resp.json()).choices[0].message.content.trim();
if (content.startsWith('```')) {
content = content
.split('\n')
.slice(1)
.join('\n')
.replace(/```\s*$/, '')
.trim();
}
const verdicts = JSON.parse(content);
if (!Array.isArray(verdicts)) {
throw new Error('Invalid LLM response format - expected array');
}
const verdictMap = new Map(verdicts.map((v) => [v.number, v]));
const confirmed = [];
for (const c of candidates) {
const verdict = verdictMap.get(c.number);
if (verdict?.duplicate) {
c.llm_reason = verdict.reason || '';
confirmed.push(c);
} else {
const reason = verdict?.reason || 'not evaluated';
console.log(` #${c.number} ruled out by LLM: ${reason}`);
}
}
return confirmed;
} catch (err) {
console.warn(
`LLM confirmation failed: ${err.message} - falling back to all candidates`
);
return candidates;
}
}
function formatComment(candidates) {
const lines = [
'**Possible duplicate detected**',
'',
'This issue may be a duplicate of the following (detected via semantic similarity + LLM review):',
'',
];
for (const c of candidates.slice(0, MAX_COMMENT_CANDIDATES)) {
const confidence = `${(c.score * 100).toFixed(0)}%`;
let line = `- #${c.number} (${confidence} match) — ${c.title}`;
if (c.llm_reason) {
line += `\n > *${c.llm_reason}*`;
}
lines.push(line);
}
lines.push(
'',
'A maintainer will review this. If this is **not** a duplicate, no action is needed.',
'',
`<!-- duplicate-bot: candidates=${candidates.map((c) => c.number).join(',')} -->`
);
return lines.join('\n');
}
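// Rendered example (sketch of one candidate line, per the template above):
//   - #123 (81% match) — Example crash report
//     > *Both report the same crash when ...*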
async function main() {
if (!ISSUE_NUMBER) {
console.error('ISSUE_NUMBER not set');
process.exit(1);
}
console.log(`Processing issue #${ISSUE_NUMBER}`);
  const issue = await getIssue(ISSUE_NUMBER);
  if (issue.pull_request) {
    console.log('Skipping - this is a pull request');
    return;
  }
  if (issue.user.type === 'Bot') {
    console.log('Skipping - issue created by bot');
    return;
  }
  // Cheap guards first; the rate-limit check below costs an extra API call.
  const oneHourAgo = new Date(Date.now() - 60 * 60 * 1000).toISOString();
  const recentIssues = await fetchIssues({
    creator: issue.user.login,
    since: oneHourAgo,
    state: 'all',
  });
  if (recentIssues.length > 10) {
    console.log(
      `User ${issue.user.login} created ${recentIssues.length} issues in the last hour - skipping to prevent spam`
    );
    return;
  }
console.log(`Loading model: ${MODEL_NAME}`);
const extractor = await pipeline('feature-extraction', MODEL_NAME, {
dtype: 'fp32',
});
const index = loadIndex(INDEX_PATH);
const text = issueText(issue.title, issue.body);
const output = await extractor(text, { pooling: 'mean', normalize: true });
const queryEmbedding = output.tolist()[0];
let candidates = findSimilar(queryEmbedding, index, {
topK: TOP_K,
threshold: SIMILARITY_THRESHOLD,
excludeNumber: issue.number,
});
if (!candidates.length) {
console.log('No similar issues found above threshold - done');
return;
}
console.log(`Found ${candidates.length} candidates above threshold:`);
for (const c of candidates) {
console.log(` #${c.number} (${c.score.toFixed(3)}) - ${c.title}`);
}
console.log('Running LLM confirmation via Groq...');
candidates = await confirmWithLlm(issue, candidates);
if (!candidates.length) {
console.log('LLM ruled out all candidates - done');
return;
}
const comment = formatComment(candidates);
await postComment(ISSUE_NUMBER, comment);
await addLabel(ISSUE_NUMBER, LABEL_NAME);
console.log('Done!');
}
main().catch((err) => {
console.error(err);
process.exit(1);
});

bin/duplicate-detector/package-lock.json generated Normal file

File diff suppressed because it is too large

bin/duplicate-detector/package.json Normal file

@@ -0,0 +1,13 @@
{
"name": "duplicate-detector",
"version": "1.0.0",
"private": true,
"type": "module",
"scripts": {
"build-index": "node build-index.mjs",
"detect": "node detect.mjs"
},
"dependencies": {
"@huggingface/transformers": "^3.8.1"
}
}

bin/duplicate-detector/utils.mjs Normal file

@@ -0,0 +1,116 @@
const GITHUB_API = 'https://api.github.com';
const GITHUB_TOKEN = process.env.GITHUB_TOKEN;
const GITHUB_REPOSITORY = process.env.GITHUB_REPOSITORY;
function ghHeaders() {
return {
Authorization: `token ${GITHUB_TOKEN}`,
Accept: 'application/vnd.github+json',
};
}
export async function fetchIssues({
  state = 'open',
  since,
  creator,
  maxIssues = 5000,
} = {}) {
const issues = [];
let page = 1;
const perPage = 100;
while (issues.length < maxIssues) {
const params = new URLSearchParams({
state,
per_page: String(perPage),
page: String(page),
sort: 'updated',
direction: 'desc',
});
    if (since) params.set('since', since);
    if (creator) params.set('creator', creator);
const url = `${GITHUB_API}/repos/${GITHUB_REPOSITORY}/issues?${params}`;
const resp = await fetch(url, { headers: ghHeaders() });
if (!resp.ok) {
throw new Error(`GitHub API error: ${resp.status} ${resp.statusText}`);
}
const batch = await resp.json();
if (!batch.length) break;
for (const item of batch) {
if (!item.pull_request) {
issues.push(item);
}
}
page++;
if (batch.length < perPage) break;
}
return issues.slice(0, maxIssues);
}
export async function getIssue(issueNumber) {
const url = `${GITHUB_API}/repos/${GITHUB_REPOSITORY}/issues/${issueNumber}`;
const resp = await fetch(url, { headers: ghHeaders() });
if (!resp.ok) {
throw new Error(`GitHub API error: ${resp.status} ${resp.statusText}`);
}
return resp.json();
}
export async function postComment(issueNumber, body) {
const url = `${GITHUB_API}/repos/${GITHUB_REPOSITORY}/issues/${issueNumber}/comments`;
const resp = await fetch(url, {
method: 'POST',
headers: { ...ghHeaders(), 'Content-Type': 'application/json' },
body: JSON.stringify({ body }),
});
if (!resp.ok) {
throw new Error(
`Failed to post comment: ${resp.status} ${resp.statusText}`
);
}
console.log(`Posted comment on #${issueNumber}`);
}
export async function addLabel(issueNumber, label) {
const url = `${GITHUB_API}/repos/${GITHUB_REPOSITORY}/issues/${issueNumber}/labels`;
const resp = await fetch(url, {
method: 'POST',
headers: { ...ghHeaders(), 'Content-Type': 'application/json' },
body: JSON.stringify({ labels: [label] }),
});
if (resp.status === 404) {
console.warn(
`Label '${label}' does not exist - skipping. Create it manually.`
);
return;
}
if (!resp.ok) {
throw new Error(`Failed to add label: ${resp.status} ${resp.statusText}`);
}
console.log(`Added label '${label}' to #${issueNumber}`);
}
export function issueText(title, body) {
body = (body || '').trim();
if (body.length > 2000) body = body.slice(0, 2000) + '...';
return body ? `${title}\n\n${body}` : title;
}
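// Both callers embed with normalize: true, so the dot product of two
// embedding vectors equals their cosine similarity.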
export function dotProduct(a, b) {
let sum = 0;
for (let i = 0; i < a.length; i++) {
sum += a[i] * b[i];
}
return sum;
}
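
For local testing, the helpers compose directly. A minimal similarity smoke test follows; this is a sketch, not part of the commit: the filename is hypothetical, and it assumes GITHUB_TOKEN and GITHUB_REPOSITORY are exported and the repository has at least two open issues.

// smoke-test.mjs (hypothetical; place next to utils.mjs in bin/duplicate-detector)
import { pipeline } from '@huggingface/transformers';
import { fetchIssues, issueText, dotProduct } from './utils.mjs';

// Same model and settings as build-index.mjs / detect.mjs.
const extractor = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {
  dtype: 'fp32',
});
const [a, b] = await fetchIssues({ state: 'open', maxIssues: 2 });
const output = await extractor(
  [issueText(a.title, a.body), issueText(b.title, b.body)],
  { pooling: 'mean', normalize: true }
);
// Normalized embeddings: dot product equals cosine similarity.
const [va, vb] = output.tolist();
console.log(`#${a.number} vs #${b.number}: ${dotProduct(va, vb).toFixed(3)}`);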