activeloopai · kaghni · May 11, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
@@ -0,0 +1,87 @@
+name: E2E (cross-agent matrix)
+
+# Manual trigger only. This workflow spawns real agent CLIs against real
+# provider APIs and a dedicated Deeplake test workspace — every run costs
+# real money and takes ~10 minutes. We deliberately do NOT run it on
+# every PR; the source + bundle byte-checks in `npm test` keep gating
+# merges. Use this workflow as a release-readiness gate by triggering it
+# manually from the Actions tab against your feature branch.
+
+on:
+  workflow_dispatch:
+    inputs:
+      case_filter:
+        description: "Only run this case id (e.g. 01-capture-smoke). Leave blank for all."
+        required: false
+        type: string
+      agent_filter:
+        description: "Only run this agent id (e.g. claude-code). Leave blank for all."
+        required: false
+        type: string
+
+permissions:
+  contents: read
+
+jobs:
+  e2e:
+    name: Tier-1 cross-agent matrix
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    # Gate the job on the upstream repository, which is where the e2e secrets
+    # live. Forks see a clean skip in the Actions UI rather than a misleading red.
+    if: ${{ github.event.repository.full_name == 'activeloopai/hivemind' }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: 22
+
+      - name: Install dependencies
+        run: npm install
+
+      - name: Build bundles
+        # The harness drives the actual bundles for codex/cursor/hermes/pi
+        # (claude-code uses --plugin-dir against the source tree). Without
+        # build, `hivemind <agent> install` would copy stale or missing
+        # bundle files into the tmp HOME.
+        run: npm run build
+
+      - name: Install agent CLIs
+        # Each tier-1 agent CLI must be on PATH for its driver to spawn.
+        # We install the npm-distributed CLIs here; cursor-agent and
+        # hermes are typically installed via the agent vendor's own
+        # installer outside the npm ecosystem. If those binaries are
+        # not on a CI runner, their driver will fail with a clear
+        # "spawn error" and the matrix continues.
+        run: |
+          npm install -g @anthropic-ai/claude-code @openai/codex
+          # Pi ships via npm too.
+          npm install -g @piapp/cli || true
+          # cursor-agent and hermes — install via curl when available; if not,
+          # their matrix entries fail loudly rather than being silently skipped.
+          curl -fsSL https://cursor.com/install-cli.sh | bash -s -- --print 2>/dev/null || echo "cursor-agent install skipped"
+          # Hermes install would go here; install method varies by vendor.
+          which claude codex pi cursor-agent hermes 2>&1 || true
+
+      - name: Run e2e matrix
+        env:
+          HIVEMIND_E2E_CREDS_JSON: ${{ secrets.HIVEMIND_E2E_CREDS_JSON }}
+          ANTHROPIC_API_KEY: ${{ secrets.HIVEMIND_E2E_ANTHROPIC_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.HIVEMIND_E2E_OPENAI_API_KEY }}
+          GOOGLE_API_KEY: ${{ secrets.HIVEMIND_E2E_GOOGLE_API_KEY }}
+          CASE_FILTER: ${{ inputs.case_filter }}
+          AGENT_FILTER: ${{ inputs.agent_filter }}
+        run: |
+          # Filters arrive via env (never spliced into the script text, so no shell injection); blank => flag omitted.
+          npm run e2e -- ${CASE_FILTER:+--case "$CASE_FILTER"} ${AGENT_FILTER:+--agent "$AGENT_FILTER"}
+
+      - name: Upload summary artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: e2e-summary
+          path: tests/e2e/results/
+          if-no-files-found: warn
+          retention-days: 30
diff --git a/.gitignore b/.gitignore
@@ -18,3 +18,5 @@ bun.lock
 deploy-to-cache.sh
 .followups-pr97.md
 .followups-pr98.md
+# e2e harness per-run output artifacts (summary.json + sandbox dumps)
+tests/e2e/results/
diff --git a/README.md b/README.md
@@ -316,6 +316,13 @@ Interactive shell against Deeplake:
 npm run shell
 ```
 
+Cross-agent end-to-end matrix — drives all six agent runtimes (five as CLI subprocesses, OpenClaw via programmatic event firing) through real prompts against a Deeplake test workspace; manually triggered, not on every PR:
+
+```bash
+npm run e2e            # full matrix; see tests/e2e/README.md for env vars
+npm run e2e -- --list  # print the matrix without spawning
+```
+
 ## License
 
 Apache License 2.0 — © Activeloop, Inc. See [LICENSE](LICENSE) for details.

diff --git a/package.json b/package.json
@@ -37,6 +37,7 @@
     "cli": "tsx src/cli/index.ts",
     "test": "vitest run",
     "typecheck": "tsc --noEmit",
+    "e2e": "tsx tests/e2e/runner.ts",
     "dup": "jscpd src",
     "audit:openclaw": "node scripts/audit-openclaw-bundle.mjs",
     "pack:check": "node scripts/pack-check.mjs",

diff --git a/src/deeplake-api.ts b/src/deeplake-api.ts
@@ -352,15 +352,24 @@ export class DeeplakeApi {
 
     // Column confirmed missing: ALTER without IF NOT EXISTS so any failure is
     // surfaced. The single tolerated exception is a race with another writer
-    // that adds the column between our SELECT and our ALTER — re-SELECT to
-    // confirm and treat as success. Everything else propagates.
+    // that adds the column between our SELECT and our ALTER — Deeplake replies
+    // "already exists" and we treat it as success.
+    //
+    // We deliberately do NOT re-SELECT `colCheck` to confirm presence after the
+    // "already exists" error. On workspaces where pg's `table_schema` doesn't
+    // match our `workspaceId` (e.g. workspaces whose backend uses a different
+    // schema name than the logical workspace name — observed on the
+    // hivemind_e2e_test workspace), the recheck would false-negate, causing us
+    // to re-throw the "already exists" error and crash the entire ensureTable
+    // flow even though the column was provably present. ALTER's "already
+    // exists" verdict is authoritative — the SQL engine can't lie about its
+    // own catalog state — so we trust it and mark the column as present.
     try {
       await this.query(`ALTER TABLE "${table}" ADD COLUMN ${column} ${sqlType}`);
     } catch (e: unknown) {
       const msg = e instanceof Error ? e.message : String(e);
       if (!/already exists/i.test(msg)) throw e;
-      const recheck = await this.query(colCheck);
-      if (recheck.length === 0) throw e;
+      // ALTER said "already exists" → column is present. Fall through to mark.
     }
     markers.writeIndexMarker(markerPath);
   }

diff --git a/tests/claude-code/deeplake-api.test.ts b/tests/claude-code/deeplake-api.test.ts
@@ -494,33 +494,45 @@ describe("DeeplakeApi.ensureTable", () => {
     await expect(api.ensureTable()).rejects.toThrow();
   });
 
-  it("tolerates 'Column already exists' on ALTER ONLY when re-SELECT confirms the race winner landed", async () => {
+  it("tolerates 'Column already exists' on ALTER without re-SELECT (ALTER's verdict is authoritative)", async () => {
+    // Before: this path required a re-SELECT to confirm the column was
+    // present before treating the ALTER error as success. That recheck
+    // false-negated on workspaces where pg's `table_schema` doesn't
+    // match the logical workspaceId (observed live on hivemind_e2e_test),
+    // causing ensureTable to crash for the whole session even though the
+    // SQL engine had just told us the column existed. The fix: trust
+    // ALTER's "already exists" verdict outright. This test locks in the
+    // new behaviour — no recheck, no crash.
     mockFetch.mockResolvedValueOnce({
       ok: true, status: 200,
       json: async () => ({ tables: [{ table_name: "my_table" }] }),
     });
-    // SELECT misses (concurrent run hasn't added the column yet)
+    // SELECT misses (e.g. workspace's table_schema filter false-negated)
     mockFetch.mockResolvedValueOnce(jsonResponse({ columns: ["?column?"], rows: [] }));
-    // ALTER fails with the deterministic "already exists" — race lost
+    // ALTER fails with the deterministic "already exists" — column is present
     mockFetch.mockResolvedValueOnce(
       jsonResponse(`{"error":"Database error: Failed to add column 'summary_embedding' to deeplake dataset: Column 'summary_embedding' already exists","code":"QUERY_ERROR"}`, 500),
     );
-    // Re-SELECT confirms the column is now present (race winner's ALTER landed)
-    mockFetch.mockResolvedValueOnce(jsonResponse({ columns: ["?column?"], rows: [[1]] }));
-    // agent SELECT info_schema → present
+    // No re-SELECT call expected — we trust ALTER's verdict.
+    // Next: agent SELECT info_schema → present
     mockFetch.mockResolvedValueOnce(jsonResponse({ columns: ["?column?"], rows: [[1]] }));
     // plugin_version SELECT info_schema → present
     mockFetch.mockResolvedValueOnce(jsonResponse({ columns: ["?column?"], rows: [[1]] }));
     const api = makeApi("my_table");
     await expect(api.ensureTable()).resolves.toBeUndefined();
-    expect(mockFetch).toHaveBeenCalledTimes(6);
+    expect(mockFetch).toHaveBeenCalledTimes(5); // was 6 with the recheck
     // 2nd call to ensureTable: listTables cached AND all column markers cached → 0 new fetches
     mockFetch.mockReset();
     await api.ensureTable();
     expect(mockFetch).toHaveBeenCalledTimes(0);
   });
 
-  it("rejects 'Column already exists' on ALTER when re-SELECT still reports missing (genuine schema problem)", async () => {
+  it("tolerates 'Column already exists' even when a hypothetical re-SELECT would still report missing", async () => {
+    // Regression guard for the exact production scenario the live e2e
+    // surfaced on hivemind_e2e_test: information_schema's table_schema
+    // filter returned 0 rows AND ALTER said 'already exists'. The old
+    // code rejected here ("genuine schema problem") — wrong, because
+    // ALTER cannot lie about its own catalog. The new code resolves.
     mockFetch.mockResolvedValueOnce({
       ok: true, status: 200,
       json: async () => ({ tables: [{ table_name: "my_table" }] }),
@@ -529,9 +541,12 @@ describe("DeeplakeApi.ensureTable", () => {
     mockFetch.mockResolvedValueOnce(
       jsonResponse(`{"error":"Database error: Column 'summary_embedding' already exists","code":"QUERY_ERROR"}`, 500),
     );
-    mockFetch.mockResolvedValueOnce(jsonResponse({ columns: ["?column?"], rows: [] })); // re-SELECT: still missing
+    // No re-SELECT call any more — would have been 'still missing' here
+    // under the old logic, but we don't issue it. Continue to the next column.
+    mockFetch.mockResolvedValueOnce(jsonResponse({ columns: ["?column?"], rows: [[1]] })); // agent
+    mockFetch.mockResolvedValueOnce(jsonResponse({ columns: ["?column?"], rows: [[1]] })); // plugin_version
     const api = makeApi("my_table");
-    await expect(api.ensureTable()).rejects.toThrow();
+    await expect(api.ensureTable()).resolves.toBeUndefined();
   });
 
   it("creates table with custom name", async () => {

diff --git a/tests/claude-code/schema-scenarios.test.ts b/tests/claude-code/schema-scenarios.test.ts
@@ -353,19 +353,25 @@ describe("scenario 7 — MIXED SESS-EMB (memory no-emb, sessions with-emb)", ()
 // ── Cross-cutting invariants ────────────────────────────────────────────────
 
 describe("schema scenarios — cross-cutting invariants", () => {
-  it("ALTER 'column already exists' (concurrent writer race) is the ONLY tolerated error — re-SELECT confirms and ensureTable resolves", async () => {
-    // Single tolerated race: another writer added the column between our
-    // SELECT (missing) and our ALTER (already-exists). Re-SELECT confirms
-    // the column exists now → success. All other ALTER failures propagate.
+  it("ALTER 'column already exists' is treated as success on its own — no re-SELECT needed", async () => {
+    // Race semantics: another writer added the column between our SELECT
+    // (missing) and our ALTER (already-exists). The fix this test guards:
+    // ALTER's verdict is authoritative on its own; we do NOT issue a
+    // re-SELECT to confirm. The re-SELECT used the same `table_schema =
+    // workspaceId` filter as the initial check, so on workspaces where
+    // that filter false-negates (live e2e symptom) the recheck would
+    // still report "missing" and crash ensureTable for the whole session.
     vi.restoreAllMocks();
     const api = new DeeplakeApi("tok", "https://api.example", "org", "ws", "memory");
     vi.spyOn(api, "listTables").mockResolvedValue(["memory", "sessions"]);
 
     let memSchemaSelectCount = 0;
     vi.spyOn(api, "query").mockImplementation(async (sql: string) => {
       if (SCHEMA_MEM.test(sql)) {
-        // First SELECT misses; re-SELECT after the racy ALTER finds it present.
-        return memSchemaSelectCount++ === 0 ? [] : PRESENT.rows;
+        // SELECT misses on every call (filter-mismatch shape). The fix
+        // means we never re-check after ALTER, so this counter ends at 1.
+        memSchemaSelectCount++;
+        return [];
       }
       if (ALTER_MEM.test(sql)) {
         throw new Error(`Query failed: ${ALREADY_EXISTS("summary_embedding").errorStatus}: ${ALREADY_EXISTS("summary_embedding").errorBody}`);
@@ -381,7 +387,7 @@ describe("schema scenarios — cross-cutting invariants", () => {
 
     await expect(api.ensureTable()).resolves.toBeUndefined();
     await expect(api.ensureSessionsTable("sessions")).resolves.toBeUndefined();
-    expect(memSchemaSelectCount).toBe(2); // initial miss + re-confirm
+    expect(memSchemaSelectCount).toBe(1); // initial miss only; no recheck
   });
 
   it("ALTER errors that are NOT 'already exists' propagate — ensureTable rejects (no silent swallow)", async () => {