-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathollama_image_vision.py
129 lines (112 loc) · 8.87 KB
/
ollama_image_vision.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import torch
import base64
from PIL import Image
import numpy as np
from io import BytesIO
import requests
import json
import ollama
from ollama import Client
# import logging
import hashlib
from typing import Dict, Any
from PIL.PngImagePlugin import PngInfo
class OllamaImageVision:
    """Node that asks an Ollama vision model to describe a batch of images.

    The class exposes ``INPUT_TYPES`` / ``RETURN_TYPES`` / ``RETURN_NAMES`` /
    ``FUNCTION`` / ``CATEGORY`` (presumably the ComfyUI custom-node layout --
    confirm against the host application).  It produces eight string outputs,
    one per prompt returned by :meth:`create_prompts`, in that order.
    """

    @classmethod
    def INPUT_TYPES(cls):
        """Return the input declaration (required and optional sockets/widgets)."""
        return {
            "required": {
                "IMAGE": ("IMAGE",),
                "output_selection": ("INT", {"default": 7, "min": 1, "max": 8,
                                             "step": 1, "display": "slider", "label": "Number of outputs (1-8)"}),
                "process_below_output_selection": ("BOOLEAN", {"default": False, "label": "Process all up to selection"})
            },
            "optional": {
                "OLLAMA_CONFIG": ("OLLAMA_CONFIG", {"forceInput": True}),
            }
        }

    RETURN_TYPES = ("STRING", "STRING", "STRING", "STRING", "STRING", "STRING", "STRING", "STRING",)
    RETURN_NAMES = ("1 - Basic Description", "2 - Advanced Description", "3 - Characters Description", "4 - Object Recognition", "5 - Semantic Understanding", "6 - Contextual Analysis", "7 - SDXL Prompt (words)", "8 - FLUX Prompt (sentences)")
    FUNCTION = "process_image_base64"
    CATEGORY = "Bjornulf"

    def __init__(self):
        # Not used by the methods below; process_image_base64 builds its own
        # Client per call.  Kept so external code reading ``.client`` still works.
        self.client = None

    def create_prompts(self):
        """Return the prompt texts keyed by prompt type.

        Dict order matters: entry ``i`` feeds output slot ``i`` of
        :meth:`process_image_base64` (and therefore ``RETURN_NAMES[i]``).
        """
        return {
            # Do not describe what isn't there.
            "basic": "Summarize the main content of the image in one concise sentence.",
            "advanced": "Describe the scene thoroughly, capturing intricate details, colors, textures, and any significant actions or events occurring in the image.",
            "characters": "Describe each character's physical appearance in vivid, descriptive terms, including clothing, expressions, body language, and notable features.",
            "objects": "Identify and describe the primary objects in the image, detailing their size, position, color, and any unique characteristics.",
            "semantic": "Analyze the image's mood, environment, and implied meaning. Discuss any symbolic elements, artistic style, and possible intent or story conveyed.",
            "context": "Describe the relationships and interactions between objects and characters, focusing on spatial arrangement, implied actions, and any contextual clues suggesting connections or purpose.",
            "SDXL": "Describe the image. The goal is to generate a concise, detailed, and effective description. Guidelines for describing the image:- Focus on visual elements, be specific about objects, colors, textures, and compositions. Use adjectives to describe key features. Avoid complete sentences or narrative descriptions. Prioritize important elements over minor details. Your input will be a detailed description of an image. Process this description and refine it into a prompt suitable for stable diffusion models using the following steps: 1. Identify the most important visual elements and characteristics. 2. Condense the description into a series of comma-separated phrases or words. 3. Prioritize specific, descriptive terms over general ones. Here are two examples of good outputs: Example 1:vibrant sunset, tropical beach, silhouetted palm trees, calm ocean, orange and purple sky, wispy clouds, golden sand, gentle waves, beachgoers in distance, serene atmosphere, warm lighting, panoramic view. Example 2: steampunk cityscape, towering clockwork structures, brass and copper tones, billowing steam, airships in sky, cobblestone streets, Victorian-era citizens, gears and pipes visible, warm sepia lighting, hazy atmosphere, intricate mechanical details. Your final output should be a single line of text containing the refined prompt, without any additional explanation or commentary. IMPORTANT : DO NOT Include information about the overall style or artistic technique.",
            "FLUX": "Describe the given image in a detailed and structured format that is specifically designed for image generation. Use descriptive language to capture the essence of the image, including the environment, objects, characters, lighting, textures, and any other notable elements. The description must use some of these 9 points : 1. Scene Type: [Outdoor/Indoor/Abstract/Fantasy/Realistic/etc.] 2. Primary Subject: [Main focus or characters in the scene.] 3. Environment Details: [Describe the setting in vivid detail, including any landscapes, architecture, or surroundings.] 4. Lighting: [Specify the type, color, and intensity of the lighting.] 5. Colors and Tones: [Dominant colors and overall mood.] 6. Perspective: [Camera angle or viewpoint—close-up, wide shot, aerial, etc.] 7. Texture and Details: [Surface materials, patterns, and fine details.] 8. Emotion or Atmosphere: [Mood conveyed by the scene—serene, ominous, lively, etc.] 9. Unique Elements: [Special features or focal points that make the image distinctive.] For example: 1. Scene Type: Outdoor, natural landscape. 2. Primary Subject: A majestic lion standing atop a rocky hill. 3. Environment Details: A vast savannah with tall golden grass, sparse acacia trees, and distant mountains under a clear blue sky. 4. Lighting: Bright, warm sunlight casting long shadows. 5. Colors and Tones: Predominantly gold and blue, with subtle earthy browns and greens. 6. Perspective: Mid-range shot, slightly low angle to emphasize the lion's dominance. 7. Texture and Details: The lion's fur appears detailed with visible strands, and the rocks have a rough, weathered texture. 8. Emotion or Atmosphere: Majestic, powerful, and serene. 9. Unique Elements: A subtle wind effect in the grass and mane, adding movement to the scene. IMPORTANT : DO NOT Include information about the overall style or artistic technique."
        }

    @staticmethod
    def _encode_image(img):
        """Return one image tensor as a base64-encoded PNG string."""
        # Scale to 0-255 and clamp before the uint8 cast.
        # NOTE(review): assumes ``img`` is a float tensor in [0, 1] whose layout
        # Image.fromarray accepts (e.g. H x W x C) -- confirm against the caller.
        numpy_img = (255. * img.cpu().numpy()).clip(0, 255).astype(np.uint8)
        pil_image = Image.fromarray(numpy_img)
        # The context manager releases the buffer even if save() raises.
        with BytesIO() as buffered:
            pil_image.save(buffered, format="PNG")
            return base64.b64encode(buffered.getvalue()).decode('utf-8')

    def process_image_base64(self, IMAGE, OLLAMA_CONFIG=None, output_selection=6, process_below_output_selection=True):
        """Run the selected prompt(s) against the images and return 8 strings.

        Args:
            IMAGE: Iterable of image tensors; each is PNG-encoded and all of
                them are sent together with every prompt.
            OLLAMA_CONFIG: Mapping with ``"model"`` and ``"url"`` keys.  When
                ``None``, model ``moondream`` at ``http://0.0.0.0:11434`` is used.
            output_selection: 1-based index of the output to produce (or the
                highest one, see below).
            process_below_output_selection: When true, outputs ``1`` through
                ``output_selection`` are all generated; otherwise only output
                ``output_selection`` is.

        Returns:
            Tuple with one string per entry of ``RETURN_TYPES``; slots that
            were not requested hold ``""``.

        Note:
            The Python-level defaults here (6, True) differ from the UI
            defaults declared in ``INPUT_TYPES`` (7, False); they are left
            untouched for backward compatibility with direct callers.
            Exceptions raised by the Ollama client propagate to the caller.
        """
        if OLLAMA_CONFIG is None:
            OLLAMA_CONFIG = {
                "model": "moondream",
                "url": "http://0.0.0.0:11434"
            }
        selected_model = OLLAMA_CONFIG["model"]
        ollama_url = OLLAMA_CONFIG["url"]

        images_base64 = [self._encode_image(img) for img in IMAGE]

        client = Client(host=ollama_url)
        prompts = list(self.create_prompts().values())

        # Always emit one slot per declared output (8), filling the
        # non-requested ones with empty strings.
        responses = []
        for index in range(len(self.RETURN_TYPES)):
            if process_below_output_selection:
                wanted = index < output_selection
            else:
                wanted = index == (output_selection - 1)

            if not wanted:
                responses.append("")
                continue

            response = client.generate(
                model=selected_model,
                prompt=prompts[index],
                images=images_base64
            )
            responses.append(response['response'].strip())
        return tuple(responses)

    def handle_error(self, error_message: str) -> tuple:
        """Return ``"Error: <message>"`` repeated once per declared output.

        The tuple length follows ``RETURN_TYPES`` so it can stand in for a
        normal result (previously it was hard-coded to 4 while the node
        declares 8 outputs).
        """
        error_response = f"Error: {error_message}"
        return (error_response,) * len(self.RETURN_TYPES)