
Commit 65761e8

Merge pull request #672 from macrocosm-os/staging

Changes:
- InferenceTask switched from seeded determinism to logits checks (see the sketch below the file summary).
- Fix HF data sampling.
- Fix DDGS web search.
- Timeout adjustments.

2 parents c95b547 + bd3578c commit 65761e8


45 files changed: 2380 additions and 1845 deletions.
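The first item in the change list is the behavioral shift worth spelling out: instead of requiring a bit-exact, seed-reproducible completion, InferenceTask responses are now judged against the token log-probabilities the miner reports. The sketch below only illustrates that idea. The `logprob_check` helper, the reference top-k lists, and the pass criterion are hypothetical and are not the validator's actual scoring code; the per-token `top_logprobs` format it presumes matches the miner.py diff further down.

# Hypothetical illustration only -- not the validator's real scoring logic.
# The idea of a "logits check": instead of demanding a bit-exact seeded completion,
# verify that each token the miner emitted is plausible under a reference model,
# e.g. that it lies inside the reference model's top-k candidates at that step.


def logprob_check(miner_tokens: list[str], reference_topk: list[list[str]]) -> float:
    """Fraction of steps where the miner's token is inside the reference top-k set."""
    if not miner_tokens or len(miner_tokens) != len(reference_topk):
        return 0.0
    hits = sum(token in topk for token, topk in zip(miner_tokens, reference_topk))
    return hits / len(miner_tokens)


# Toy example: three generation steps; the last token falls outside the reference set.
miner_tokens = ["Hello", " there", "zzz"]
reference_topk = [["Hello", "Hi"], [" there", ","], ["!", "."]]
print(logprob_check(miner_tokens, reference_topk))  # ~0.67 (2 of 3 steps pass)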

.env.miner.example

Lines changed: 3 additions & 3 deletions
@@ -8,13 +8,13 @@ SUBTENSOR_NETWORK = "test"
 SUBTENSOR_CHAIN_ENDPOINT = None
 
 # The name of your wallet.
-WALLET_NAME="miner"
+WALLET_NAME="example_wallet"
 
 # The name of the hotkey associated with the validator wallet.
-HOTKEY="default"
+HOTKEY="example_hotkey"
 
 # Open port which can be used to connect to the network.
-AXON_PORT=22116
+AXON_PORT=12345
 
 # The OpenAI API key (only needed for the OpenAI test miner).
 OPENAI_API_KEY="YOUR_API_HERE"

neurons/miners/epistula_miner/miner.py

Lines changed: 101 additions & 20 deletions
@@ -14,13 +14,13 @@
 import uvicorn
 from bittensor.core.axon import FastAPIThreadedServer
 from bittensor.core.extrinsics.serving import serve_extrinsic
-from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
+from fastapi import APIRouter, FastAPI, HTTPException, Request
 from loguru import logger
 from starlette.background import BackgroundTask
 from starlette.responses import StreamingResponse
+from vllm import LLM, SamplingParams
 from web_retrieval import get_websites_with_similarity
 
-from prompting.llms.hf_llm import ReproducibleHF
 from shared.epistula import verify_signature
 
 MODEL_ID: str = "gpt-3.5-turbo"
@@ -30,8 +30,56 @@
 NEURON_TOP_P: float = 0.95
 NEURON_STREAMING_BATCH_SIZE: int = 12
 NEURON_STOP_ON_FORWARD_EXCEPTION: bool = False
-SHOULD_SERVE_LLM: bool = False
-LOCAL_MODEL_ID = "casperhansen/llama-3-8b-instruct-awq"
+SHOULD_SERVE_LLM: bool = True
+LOCAL_MODEL_ID = "casperhansen/llama-3.2-3b-instruct-awq"
+
+
+def get_token_logprobs(llm, prompt, sampling_params):
+    """Get logprobs and chosen tokens for text generation."""
+    outputs = llm.generate(prompt, sampling_params)
+
+    if not outputs:
+        return None
+
+    output = outputs[0].outputs[0]
+    generated_text = output.text
+    logprobs_sequence = output.logprobs
+    generated_tokens = output.token_ids
+
+    if logprobs_sequence is None:
+        return None
+
+    token_logprobs = []
+    for i, logprobs in enumerate(logprobs_sequence):
+        if logprobs is None:
+            continue
+
+        # Convert to list and sort by logprob value
+        logprobs_list = [(k, v.logprob) for k, v in logprobs.items()]
+        sorted_logprobs = sorted(logprobs_list, key=lambda x: x[1], reverse=True)
+
+        # Get top tokens and logprobs
+        top_token_ids = [x[0] for x in sorted_logprobs]
+        top_logprob_values = [x[1] for x in sorted_logprobs]
+
+        # Store the actual chosen token from generation
+        chosen_token = llm.get_tokenizer().decode([generated_tokens[i]])
+
+        # Format top logprobs as list of dictionaries
+        top_logprobs = [
+            {"token": llm.get_tokenizer().decode([tid]), "logprob": lp}
+            for tid, lp in zip(top_token_ids, top_logprob_values)
+        ]
+
+        # Store logprobs for this step
+        step_logprobs = {
+            "token": chosen_token,
+            "top_tokens": [llm.get_tokenizer().decode([tid]) for tid in top_token_ids],
+            "top_logprobs": top_logprobs,
+        }
+        token_logprobs.append(step_logprobs)
+
+    return {"text": generated_text, "token_logprobs": token_logprobs}
 
 
 class OpenAIMiner:
@@ -45,11 +93,8 @@ def __init__(self):
             },
         )
         if SHOULD_SERVE_LLM:
-            self.llm = ReproducibleHF(
-                model_id=LOCAL_MODEL_ID,
-                device=shared_settings.NEURON_DEVICE,
-                sampling_params=shared_settings.SAMPLING_PARAMS,
-            )
+            self.llm = LLM(model=LOCAL_MODEL_ID, gpu_memory_utilization=0.3, max_model_len=1000)
+            self.tokenizer = self.llm.get_tokenizer()
         else:
            self.llm = None
 
@@ -83,7 +128,7 @@ async def word_stream(body, headers):
     async def create_chat_completion(self, request: Request):
         data = await request.json()
         headers = request.headers
-        if self.llm and request.headers.get("task", None) == "inference":
+        if self.llm and request.headers.get("task", None) == "InferenceTask":
             return await self.create_inference_completion(request)
         if request.headers.get("task", None) == "WebRetrievalTask":
             return await self.stream_web_retrieval(data, headers)
@@ -93,14 +138,50 @@ async def create_chat_completion(self, request: Request):
 
     async def create_inference_completion(self, request: Request):
         async def word_stream():
-            inference = await self.run_inference(request)
-            words = inference.split()
-            print(words)
-            for word in words:
-                # Simulate the OpenAI streaming response format
-                data = {"choices": [{"delta": {"content": word + " "}, "index": 0, "finish_reason": None}]}
+            data = await request.json()
+            messages = data.get("messages", [])
+            sampling_params = SamplingParams(
+                max_tokens=NEURON_MAX_TOKENS,
+                temperature=NEURON_TEMPERATURE,
+                top_k=NEURON_TOP_K,
+                top_p=NEURON_TOP_P,
+                logprobs=10,
+            )
+
+            prompt = self.tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+
+            # Get generation with logprobs
+            result = get_token_logprobs(self.llm, prompt, sampling_params)
+            if not result:
+                yield f"data: {json.dumps({'error': 'Generation failed'})}\n\n"
+                return
+
+            # Stream tokens and their logprobs
+            for step in result["token_logprobs"]:
+                logger.info(step)
+                token = step["token"]
+                logprobs_info = {"top_logprobs": step["top_logprobs"]}
+
+                # Format in OpenAI streaming style but include logprobs
+                data = {
+                    "choices": [
+                        {
+                            "delta": {
+                                "content": token,
+                            },
+                            "logprobs": {"content": [logprobs_info]},
+                            "index": 0,
+                            "finish_reason": None,
+                        }
+                    ]
+                }
                 yield f"data: {json.dumps(data)}\n\n"
-                await asyncio.sleep(0.1)  # Simulate a delay between words
+                await asyncio.sleep(0.1)
+
             # Indicate the end of the stream
             data = {"choices": [{"delta": {}, "index": 0, "finish_reason": "stop"}]}
             yield f"data: {json.dumps(data)}\n\n"
@@ -119,7 +200,7 @@ async def check_availability(self, request: Request):
 
         # Set all model availabilities to False (openai will not be able to handle seeded inference)
         model_response = {key: key == LOCAL_MODEL_ID for key in llm_model_availabilities}
-
+        print(model_response)
         response = {"task_availabilities": task_response, "llm_model_availabilities": model_response}
 
         return response
@@ -158,7 +239,7 @@ async def verify_request(
             raise HTTPException(status_code=400, detail=err)
 
     def run(self):
-        external_ip = None  # shared_settings.EXTERNAL_IP
+        external_ip = None
         if not external_ip or external_ip == "[::]":
             try:
                 external_ip = requests.get("https://checkip.amazonaws.com").text.strip()
@@ -187,7 +268,7 @@ def run(self):
         router.add_api_route(
             "/v1/chat/completions",
             self.create_chat_completion,
-            dependencies=[Depends(self.verify_request)],
+            # dependencies=[Depends(self.verify_request)],
             methods=["POST"],
         )
         router.add_api_route(
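The rewritten `create_inference_completion` streams OpenAI-style SSE chunks in which each delta also carries a `logprobs` field. Below is a rough client-side sketch for consuming that stream: the URL and port are placeholders, the `task` header value and chunk layout mirror the diff above, and request signing plus error handling are omitted.

# Hedged client-side sketch: consume the miner's SSE stream and keep tokens + logprobs.
# URL/port are placeholders; the "task" header and chunk structure mirror the diff above.
import json

import requests


def read_inference_stream(url: str = "http://localhost:12345/v1/chat/completions"):
    payload = {"messages": [{"role": "user", "content": "Hello, how are you?"}]}
    headers = {"task": "InferenceTask"}
    tokens, logprobs = [], []
    with requests.post(url, json=payload, headers=headers, stream=True, timeout=60) as resp:
        for raw_line in resp.iter_lines():
            if not raw_line or not raw_line.startswith(b"data: "):
                continue
            chunk = json.loads(raw_line[len(b"data: "):])
            choice = chunk["choices"][0]
            if choice.get("finish_reason") == "stop":
                break
            tokens.append(choice["delta"].get("content", ""))
            logprobs.append(choice.get("logprobs", {}).get("content", []))
    return "".join(tokens), logprobs


# Example (requires a running miner):
# text, logprobs = read_inference_stream()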

neurons/miners/epistula_miner/web_retrieval.py

Lines changed: 2 additions & 7 deletions
@@ -3,16 +3,11 @@
 
 import numpy as np
 import trafilatura
+from duckduckgo_search.duckduckgo_search import DDGS
 from openai import OpenAI
 
-from prompting.base.duckduckgo_patch import PatchedDDGS
 from shared import settings
 
-# Import the patched DDGS and use that
-
-
-# Import the patched DDGS and use that
-
 
 async def fetch_url(url: str) -> str:
     return trafilatura.fetch_url(url)
@@ -54,7 +49,7 @@ async def get_websites_with_similarity(
     Returns:
         List of dictionaries containing website URLs and their best matching chunks
     """
-    ddgs = PatchedDDGS(proxy=settings.shared_settings.PROXY_URL, verify=False)
+    ddgs = DDGS(proxy=settings.shared_settings.PROXY_URL, verify=False)
     results = list(ddgs.text(query))
     urls = [r["href"] for r in results][:n_results]
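With the patched wrapper gone, search goes directly through `duckduckgo_search`'s own `DDGS` client. A minimal usage sketch, assuming the package is installed and using a placeholder proxy, is:

# Minimal sketch of the direct DDGS usage that replaces PatchedDDGS.
# The proxy value is a placeholder; verify=False mirrors the call in the diff above.
from duckduckgo_search.duckduckgo_search import DDGS

ddgs = DDGS(proxy=None, verify=False)  # e.g. proxy="socks5h://user:pass@host:port"
results = list(ddgs.text("bittensor subnet 1"))
urls = [r["href"] for r in results][:5]
print(urls)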

neurons/validator.py

Lines changed: 2 additions & 2 deletions
@@ -123,7 +123,7 @@ async def spawn_loops(task_queue, scoring_queue, miners_dict: dict):
 
     logger.info("Starting task sending loop in validator...")
     asyncio.create_task(task_sender.start(task_queue, scoring_queue, miners_dict, simultaneous_loops=1))
-    logger.error("Task sending loop started")
+    logger.debug("Task sending loop started")
     while True:
         await asyncio.sleep(5)
         logger.debug("Task sending loop is running")
@@ -195,7 +195,7 @@ async def main(
 
     try:
         # Start checking the availability of miners at regular intervals
-        if settings.shared_settings.DEPLOY_SCORING_API:
+        if settings.shared_settings.DEPLOY_SCORING_API and not settings.shared_settings.NEURON_DISABLE_SET_WEIGHTS:
             # Use multiprocessing to bypass API blocking issue
             api_process = mp.Process(
                 target=start_api, args=(scoring_queue, reward_events, miners_dict), name="APIProcess"

notebooks/demo.ipynb

Lines changed: 86 additions & 0 deletions
@@ -1,5 +1,91 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO 04-02 16:34:52 [config.py:585] This model supports multiple tasks: {'classify', 'generate', 'score', 'embed', 'reward'}. Defaulting to 'generate'.\n",
+      "INFO 04-02 16:34:54 [awq_marlin.py:114] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.\n",
+      "INFO 04-02 16:34:54 [config.py:1697] Chunked prefill is enabled with max_num_batched_tokens=8192.\n",
+      "INFO 04-02 16:34:55 [core.py:54] Initializing a V1 LLM engine (v0.8.2) with config: model='casperhansen/llama-3.2-3b-instruct-awq', speculative_config=None, tokenizer='casperhansen/llama-3.2-3b-instruct-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=1000, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=casperhansen/llama-3.2-3b-instruct-awq, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={\"level\":3,\"custom_ops\":[\"none\"],\"splitting_ops\":[\"vllm.unified_attention\",\"vllm.unified_attention_with_output\"],\"use_inductor\":true,\"compile_sizes\":[],\"use_cudagraph\":true,\"cudagraph_num_of_warmups\":1,\"cudagraph_capture_sizes\":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],\"max_capture_size\":512}\n",
+      "WARNING 04-02 16:34:55 [utils.py:2321] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7f3b900b7fa0>\n",
+      "INFO 04-02 16:34:56 [parallel_state.py:954] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0\n",
+      "INFO 04-02 16:34:56 [cuda.py:220] Using Flash Attention backend on V1 engine.\n",
+      "INFO 04-02 16:34:56 [gpu_model_runner.py:1174] Starting to load model casperhansen/llama-3.2-3b-instruct-awq...\n",
+      "WARNING 04-02 16:34:56 [topk_topp_sampler.py:63] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.\n",
+      "INFO 04-02 16:34:56 [weight_utils.py:265] Using model weights format ['*.safetensors']\n",
+      "INFO 04-02 16:34:57 [weight_utils.py:315] No model.safetensors.index.json found in remote.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]\n",
+      "Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 2.00it/s]\n",
+      "Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 1.99it/s]\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO 04-02 16:34:57 [loader.py:447] Loading weights took 0.59 seconds\n",
+      "INFO 04-02 16:34:58 [gpu_model_runner.py:1186] Model loading took 2.1364 GB and 1.841784 seconds\n",
+      "INFO 04-02 16:35:07 [backends.py:415] Using cache directory: /root/.cache/vllm/torch_compile_cache/0b0416c300/rank_0_0 for vLLM's torch.compile\n",
+      "INFO 04-02 16:35:07 [backends.py:425] Dynamo bytecode transform time: 9.07 s\n",
+      "INFO 04-02 16:35:08 [backends.py:115] Directly load the compiled graph for shape None from the cache\n",
+      "INFO 04-02 16:35:14 [monitor.py:33] torch.compile takes 9.07 s in total\n",
+      "INFO 04-02 16:35:15 [kv_cache_utils.py:566] GPU KV cache size: 217,248 tokens\n",
+      "INFO 04-02 16:35:15 [kv_cache_utils.py:569] Maximum concurrency for 1,000 tokens per request: 217.25x\n",
+      "INFO 04-02 16:35:33 [gpu_model_runner.py:1534] Graph capturing finished in 18 secs, took 0.48 GiB\n",
+      "INFO 04-02 16:35:34 [core.py:151] init engine (profile, create kv cache, warmup model) took 35.70 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "from prompting.llms.vllm_llm import ReproducibleVLLM\n",
+    "from loguru import logger\n",
+    "\n",
+    "try: \n",
+    " llm = ReproducibleVLLM(model_id=\"casperhansen/llama-3.2-3b-instruct-awq\", device=\"cuda:0\", sampling_params={\"temperature\": 0.0})\n",
+    "except Exception as e:\n",
+    " logger.exception(e)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 1.37it/s, est. speed input: 9.62 toks/s, output: 137.41 toks/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "\" Congrats on reaching the 1 year milestone in your marriage!\\n\\nI think there may be a few... issues (no, just kidding, or am I?). Seriously though, is this a hypothetical or real scenario? Either way, I think we can have some fun with this!\\n\\nIf you'd like, we could explore some fun conversations, think of some creative writing prompts, or even plan a fun activity together. Let me know what's on your mind, and I'll do my best to help\""
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "await llm.generate(\"Hello, how are you?\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
