Detect Cups from Multi Prompts
This example demonstrates how to use Tzara, the Telekinesis Physical AI Agent, to build a computer vision pipeline that detects cups from a webcam feed, refined step by step over multiple prompts. Tzara generates code that captures an image from the webcam, runs a pre-trained YOLOX object detection model to find cups, and outputs bounding boxes and confidence scores; follow-up prompts add an OpenCV visualization and save the annotated image.
Cup detected with YOLOX — bounding box and confidence score
The Natural Language Instruction: Prompt 1
The natural language instruction for this task is:
capture an image with a webcam and detect the cup in the image
The Generated Code for Prompt 1
The generated code captures an image from the webcam, detects cups using an object detection model, and prints the bounding boxes and confidence scores for each detected cup.
# Pipeline: Capture image from webcam and detect cup using object detection
# Steps:
# 1. Import required modules
# 2. Initialize and connect to webcam
# 3. Capture a single frame
# 4. Run object detection using YOLOX to find the cup
# 5. Print/display the detection results
from telekinesis.medulla.cameras.webcam import Webcam
from telekinesis import retina
import numpy as np
cam = Webcam(name="webcam_0", camera_id=0)
cam.connect()
try:
# Step 1: Capture a single color frame from the webcam
frame = cam.capture_color_image()
# Step 2: Detect objects using YOLOX (COCO includes 'cup')
annotations, categories = retina.detect_objects_using_yolox(
image=frame,
score_threshold=0.25,
nms_threshold=0.45,
)
# Step 3: Filter and print cup detections
category_id_to_name = {}
if hasattr(categories, 'categories') and categories.categories is not None:
for cat in categories.categories:
if isinstance(cat, dict):
category_id_to_name[cat['id']] = cat['name']
else:
category_id_to_name[cat.id] = cat.name
print("Detection results:")
cup_detections = []
if hasattr(annotations, 'annotations') and annotations.annotations is not None:
for ann in annotations.annotations:
if isinstance(ann, dict):
cat_id = ann.get('category_id')
cat_name = category_id_to_name.get(cat_id, "unknown")
score = ann.get('score', None)
bbox = ann.get('bbox', None)
else:
cat_id = ann.category_id
cat_name = category_id_to_name.get(cat_id, "unknown")
score = ann.score
bbox = ann.bbox
if cat_name.lower() == 'cup':
cup_detections.append({'category': cat_name, 'score': score, 'bbox': bbox})
print(f" Cup detected! Score: {score:.2f}, BBox: {bbox}")
if not cup_detections:
print(" No cup detected in the image.")
else:
print(f" Total cups detected: {len(cup_detections)}")
finally:
cam.disconnect()The Natural Language Instruction: Prompt 2
To add visualization of the detected cups, we can prompt Tzara with:
Could you add visualization using OpenCV
The Generated Code for Prompt 2
The generated code adds visualization of the detected cups using OpenCV to draw bounding boxes around them and display the image.
# Pipeline: Capture image from webcam, detect cup, draw bounding boxes, display result
import cv2
import numpy as np
from telekinesis.medulla.cameras.webcam import Webcam
from telekinesis import retina
from datatypes import datatypes
cam = Webcam(name="webcam_0", camera_id=0)
cam.connect()
try:
frame_rgb = cam.capture_color_image()
annotations, categories = retina.detect_objects_using_yolox(
image=frame_rgb,
score_threshold=0.25,
nms_threshold=0.45,
)
cat_id_to_name = {}
if categories is not None:
for cat in categories.categories:
cat_id_to_name[cat["id"]] = cat["name"]
frame_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
cup_count = 0
if annotations is not None:
for ann in annotations.annotations:
category_name = cat_id_to_name.get(ann["category_id"], "").lower()
if category_name != "cup":
continue
cup_count += 1
bbox = ann["bbox"]
x, y, w, h = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
score = ann.get("score", None)
cv2.rectangle(frame_bgr, (x, y), (x + w, y + h), (0, 255, 0), 2)
label = f"cup: {score:.2f}" if score is not None else "cup"
(text_w, text_h), baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
cv2.rectangle(frame_bgr, (x, y - text_h - baseline - 4), (x + text_w, y), (0, 255, 0), -1)
cv2.putText(frame_bgr, label, (x, y - baseline - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 2)
print(f"Detected {cup_count} cup(s) in the image.")
cv2.imshow("Cup Detection", frame_bgr)
cv2.waitKey(0)
finally:
cam.disconnect()
cv2.destroyAllWindows()Natural Language Instruction: Prompt 3
To save the visualization result, we can further prompt Tzara with:
Also save the image
The Generated Code for Prompt 3
The generated code saves the annotated image with detected cups to disk.
"""
STARTED FROM capture_detect_cup_and_visualize.py
Prompt: Also save the image
"""
# Pipeline: Capture image from webcam, detect cup, draw bounding boxes, save and display
import numpy as np
import cv2
from datatypes import datatypes
from telekinesis.medulla.cameras.webcam import Webcam
from telekinesis import retina

# ── Step 1: Connect to the webcam ──────────────────────────────────────────────
cam = Webcam(name="webcam_0", camera_id=0)
cam.connect()
try:
    # ── Step 2: Capture a single color frame ───────────────────────────────────
    frame_np = cam.capture_color_image()  # Returns RGB numpy array (H, W, 3)

    # ── Step 3: Detect objects using YOLOX ─────────────────────────────────────
    # detect_objects_using_yolox returns (ObjectDetectionAnnotations, Categories)
    annotations, categories = retina.detect_objects_using_yolox(
        image=frame_np,
        score_threshold=0.25,
        nms_threshold=0.45,
    )

    # ── Step 4: Filter detections to only keep "cup" ───────────────────────────
    # Build a mapping from category id -> category name using the Categories object
    cat_id_to_name = {}
    if categories is not None and hasattr(categories, "categories"):
        for cat in categories.categories:
            cat_id_to_name[cat["id"]] = cat["name"]

    # ── Step 5: Draw bounding boxes on the image ───────────────────────────────
    # Work on a BGR copy for OpenCV drawing
    annotated_image = cv2.cvtColor(frame_np, cv2.COLOR_RGB2BGR)
    cup_color = (0, 255, 0)  # Green bounding box for cups
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.7
    thickness = 2

    if annotations is not None and hasattr(annotations, "annotations"):
        for ann in annotations.annotations:
            category_id = ann.get("category_id", -1)
            category_name = cat_id_to_name.get(category_id, "unknown")
            # Only draw boxes for "cup"
            if category_name.lower() != "cup":
                continue

            # bbox is in XYWH format (COCO standard)
            bbox = ann.get("bbox", [0, 0, 0, 0])
            x, y, w, h = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
            score = ann.get("score", None)

            # Draw rectangle
            cv2.rectangle(annotated_image, (x, y), (x + w, y + h), cup_color, thickness)

            # Build label string
            label = "cup"
            if score is not None:
                label += f" {score:.2f}"

            # Draw label background and text. The label normally sits just above
            # the box; for boxes touching the top edge that position is
            # off-image, so push the label down until it is fully visible.
            (text_w, text_h), baseline = cv2.getTextSize(label, font, font_scale, thickness)
            label_bottom = max(y, text_h + baseline + 4)
            cv2.rectangle(
                annotated_image,
                (x, label_bottom - text_h - baseline - 4),
                (x + text_w, label_bottom),
                cup_color,
                cv2.FILLED,
            )
            cv2.putText(
                annotated_image,
                label,
                (x, label_bottom - baseline - 2),
                font,
                font_scale,
                (0, 0, 0),  # Black text on green background
                thickness,
                cv2.LINE_AA,
            )

    # ── Step 6: Save the annotated image to disk ───────────────────────────────
    # Save before opening the window: the display step blocks on a key press and
    # can fail without a GUI, which would otherwise prevent the file from being
    # written. cv2.imwrite reports failure through its return value, so check it
    # instead of announcing success unconditionally.
    output_path = "cup_detection_result.jpg"
    if cv2.imwrite(output_path, annotated_image):
        print(f"Annotated image saved to: {output_path}")
    else:
        print(f"Failed to save annotated image to: {output_path}")

    # ── Step 7: Display the annotated image ────────────────────────────────────
    cv2.imshow("Cup Detection", annotated_image)
    cv2.waitKey(0)
finally:
    # Close the windows even if releasing the camera raises.
    try:
        cam.disconnect()
    finally:
        cv2.destroyAllWindows()

