This notebook demonstrates how to leverage multimodal models through two different methodologies: the MultimodalConversableAgent and the VisionCapability.
Install ag2 with the lmm option:
pip install "ag2[lmm]>=0.2.17"
import logging
import os
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import autogen
from autogen import Agent, AssistantAgent, ConversableAgent, LLMConfig, UserProxyAgent
from autogen.agentchat.contrib.capabilities.vision_capability import VisionCapability
from autogen.agentchat.contrib.img_utils import pil_to_data_uri
from autogen.agentchat.contrib.multimodal_conversable_agent import MultimodalConversableAgent
from autogen.code_utils import content_str
logger = logging.getLogger(__name__)
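# Two model configurations: llm_config_4v selects the multimodal GPT-4V model,
# and gpt4_llm_config selects text-only GPT-4 models.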
llm_config_4v = LLMConfig.from_json(path="OAI_CONFIG_LIST", temperature=0.5, max_tokens=300).where(
model=["gpt-4-vision-preview"]
)
gpt4_llm_config = LLMConfig.from_json(path="OAI_CONFIG_LIST", cache_seed=42).where(
model=["gpt-4", "gpt-4-0314", "gpt4", "gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-v0314"]
)
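# The MultimodalConversableAgent can read images referenced with <img ...> tags in incoming messages.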
image_agent = MultimodalConversableAgent(
name="image-explainer",
max_consecutive_auto_reply=10,
llm_config=llm_config_4v,
)
user_proxy = autogen.UserProxyAgent(
name="User_proxy",
system_message="A human admin.",
human_input_mode="NEVER", # Try between ALWAYS or NEVER
max_consecutive_auto_reply=0,
code_execution_config={
"use_docker": False
}, # Please set use_docker=True if docker is available to run the generated code. Using docker is safer than running the generated code directly.
)
# Ask the question with an image
user_proxy.initiate_chat(
image_agent,
message="""What's the breed of this dog?
<img https://th.bing.com/th/id/R.422068ce8af4e15b0634fe2540adea7a?rik=y4OcXBE%2fqutDOw&pid=ImgRaw&r=0>.""",
)
# Ask a follow-up question with another image
user_proxy.send(
message="""What is this breed?
<img https://th.bing.com/th/id/OIP.29Mi2kJmcHHyQVGe_0NG7QHaEo?pid=ImgDet&rs=1>
Among the breeds, which one barks less?""",
recipient=image_agent,
)
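The <img ...> tag is not limited to URLs. Below is a minimal sketch assuming the tag also accepts a local file path; the filename dog.jpg is hypothetical.
# Minimal sketch: reference a local image file instead of a URL.
# "dog.jpg" is a hypothetical path; point it at a real image on disk.
user_proxy.send(
    message="""Describe this picture in one sentence.
<img dog.jpg>""",
    recipient=image_agent,
)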
Next, we create a FigureCreator agent, which contains three child agents: commander, coder, and critics.
working_dir = "tmp/"
class FigureCreator(ConversableAgent):
def __init__(self, n_iters=2, **kwargs):
"""Initializes a FigureCreator instance.
This agent facilitates the creation of visualizations through a collaborative effort among its child agents: commander, coder, and critics.
Parameters:
- n_iters (int, optional): The number of "improvement" iterations to run. Defaults to 2.
- **kwargs: keyword arguments for the parent ConversableAgent.
"""
super().__init__(**kwargs)
self.register_reply([Agent, None], reply_func=FigureCreator._reply_user, position=0)
self._n_iters = n_iters
def _reply_user(self, messages=None, sender=None, config=None):
if all((messages is None, sender is None)):
error_msg = f"Either {messages=} or {sender=} must be provided."
logger.error(error_msg)
raise AssertionError(error_msg)
if messages is None:
messages = self._oai_messages[sender]
user_question = messages[-1]["content"]
# Define the agents
commander = AssistantAgent(
name="Commander",
human_input_mode="NEVER",
max_consecutive_auto_reply=10,
system_message="Help me run the code, and tell other agents it is in the <img result.jpg> file location.",
is_termination_msg=lambda x: x.get("content", "").rstrip().endswith("TERMINATE"),
code_execution_config={"last_n_messages": 3, "work_dir": working_dir, "use_docker": False},
llm_config=self.llm_config,
)
critics = MultimodalConversableAgent(
name="Critics",
system_message="""Criticize the input figure. How to replot the figure so it will be better? Find bugs and issues for the figure.
Pay attention to the color, format, and presentation. Keep in mind of the reader-friendliness.
If you think the figures is good enough, then simply say NO_ISSUES""",
llm_config=llm_config_4v,
human_input_mode="NEVER",
max_consecutive_auto_reply=1,
)
coder = AssistantAgent(
name="Coder",
llm_config=self.llm_config,
)
coder.update_system_message(
coder.system_message
+ "ALWAYS save the figure in `result.jpg` file. Tell other agents it is in the <img result.jpg> file location."
)
# Data flow begins
commander.initiate_chat(coder, message=user_question)
img = Image.open(os.path.join(working_dir, "result.jpg"))
plt.imshow(img)
plt.axis("off") # Hide the axes
plt.show()
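# Iteratively ask the critics for feedback on the rendered figure and have the coder revise it.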
for i in range(self._n_iters):
commander.send(
message=f"Improve <img {os.path.join(working_dir, 'result.jpg')}>",
recipient=critics,
request_reply=True,
)
feedback = commander._oai_messages[critics][-1]["content"]
if feedback.find("NO_ISSUES") >= 0:
break
commander.send(
message="Here is the feedback to your figure. Please improve! Save the result to `result.jpg`\n"
+ feedback,
recipient=coder,
request_reply=True,
)
img = Image.open(os.path.join(working_dir, "result.jpg"))
plt.imshow(img)
plt.axis("off") # Hide the axes
plt.show()
return True, os.path.join(working_dir, "result.jpg")
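# Instantiate the FigureCreator with the text-only GPT-4 config; its internal Critics agent uses the GPT-4V config defined earlier.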
creator = FigureCreator(name="Figure Creator~", llm_config=gpt4_llm_config)
user_proxy = autogen.UserProxyAgent(
name="User", human_input_mode="NEVER", max_consecutive_auto_reply=0, code_execution_config={"use_docker": False}
)
user_proxy.initiate_chat(
creator,
message="""
Plot a figure by using the data from:
https://raw.githubusercontent.com/vega/vega/main/docs/data/seattle-weather.csv
I want to show both temperature high and low.
""",
)
agent1 = MultimodalConversableAgent(
name="image-explainer-1",
max_consecutive_auto_reply=10,
llm_config=llm_config_4v,
system_message="Your image description is poetic and engaging.",
)
agent2 = MultimodalConversableAgent(
name="image-explainer-2",
max_consecutive_auto_reply=10,
llm_config=llm_config_4v,
system_message="Your image description is factual and to the point.",
)
user_proxy = autogen.UserProxyAgent(
name="User_proxy",
system_message="Describe image for me.",
human_input_mode="TERMINATE", # Try between ALWAYS, NEVER, and TERMINATE
max_consecutive_auto_reply=10,
code_execution_config={
"use_docker": False
}, # Please set use_docker=True if docker is available to run the generated code. Using docker is safer than running the generated code directly.
)
# We set max_round to 5
groupchat = autogen.GroupChat(agents=[agent1, agent2, user_proxy], messages=[], max_round=5)
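# VisionCapability lets the non-multimodal GroupChatManager work with images by captioning them via the GPT-4V config.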
vision_capability = VisionCapability(lmm_config=llm_config_4v)
group_chat_manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=gpt4_llm_config)
vision_capability.add_to_agent(group_chat_manager)
rst = user_proxy.initiate_chat(
group_chat_manager,
message="""Write a poet for my image:
<img https://th.bing.com/th/id/R.422068ce8af4e15b0634fe2540adea7a?rik=y4OcXBE%2fqutDOw&pid=ImgRaw&r=0>.""",
)
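# Compare a plain GPT-4 agent with an otherwise identical agent augmented with VisionCapability.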
agent_no_vision = AssistantAgent(name="Regular LLM Agent", llm_config=gpt4_llm_config)
agent_with_vision = AssistantAgent(name="Regular LLM Agent with Vision Capability", llm_config=gpt4_llm_config)
vision_capability = VisionCapability(lmm_config=llm_config_4v)
vision_capability.add_to_agent(agent_with_vision)
user = UserProxyAgent(
name="User",
human_input_mode="NEVER",
max_consecutive_auto_reply=0,
code_execution_config={"use_docker": False},
)
message = """Write a poet for my image:
<img https://th.bing.com/th/id/R.422068ce8af4e15b0634fe2540adea7a?rik=y4OcXBE%2fqutDOw&pid=ImgRaw&r=0>."""
user.send(message=message, recipient=agent_no_vision, request_reply=True)
user.send(message=message, recipient=agent_with_vision, request_reply=True)
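# VisionCapability can also use a custom caption function in place of the default LMM captioning.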
def my_description(image_url: str, image_data: Image.Image = None, lmm_client: object = None) -> str:
"""This function takes an image URL and returns the description.
Parameters:
- image_url (str): The URL of the image.
- image_data (PIL.Image): The image data.
- lmm_client (object): The LLM client object.
Returns:
- str: A description of the color of the image.
"""
# Print the arguments for illustration purposes
print("image_url", image_url)
print("image_data", image_data)
print("lmm_client", lmm_client)
img_uri = pil_to_data_uri(image_data) # cast data into URI (str) format for API call
lmm_out = lmm_client.create(
context=None,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "Describe this image in 10 words."},
{
"type": "image_url",
"image_url": {
"url": img_uri,
},
},
],
}
],
)
description = lmm_out.choices[0].message.content
description = content_str(description)
# Convert the image into an array of pixels.
pixels = np.array(image_data)
# Calculate the average color.
avg_color_per_row = np.mean(pixels, axis=0)
avg_color = np.mean(avg_color_per_row, axis=0)
avg_color = avg_color.astype(int) # Convert to integer for color values
# Format the average color as a string description.
caption = f"""The image is from {image_url}
It is about: {description}
The average color of the image is RGB:
({avg_color[0]}, {avg_color[1]}, {avg_color[2]})"""
print(caption) # For illustration purposes
return caption
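# Pass the custom caption function via custom_caption_func when constructing the capability.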
agent_with_vision_and_func = AssistantAgent(
name="Regular LLM Agent with Custom Func and LMM", llm_config=gpt4_llm_config
)
vision_capability_with_func = VisionCapability(
lmm_config=llm_config_4v,
custom_caption_func=my_description,
)
vision_capability_with_func.add_to_agent(agent_with_vision_and_func)
user.send(message=message, recipient=agent_with_vision_and_func, request_reply=True)