Skip to content

Detect Silver Glasses from Webcam Image

This example demonstrates how to use Tzara, the Telekinesis Physical AI Agent, to build a computer vision pipeline that detects silver glasses from a webcam feed. Tzara generates code that captures an image from the webcam using OpenCV, runs a pre-trained Qwen VLM model for object detection, and draws the resulting bounding boxes on the image with OpenCV.

The Natural Language Instruction

The natural language instruction for this task is:

Capture an image and detect the silver glasses in the image and visualize

The Generated Code

The generated code captures an image from the webcam, detects silver glasses using Qwen VLM, overlays the bounding boxes and confidence scores on the image, displays the annotated result, and saves it to disk.

python
# Pipeline: Capture image from webcam, detect silver glasses, visualize detections
# Steps:
# 1. Import required modules
# 2. Connect to webcam and capture a frame
# 3. Detect "silver glasses" using Qwen VLM (open-vocabulary object description)
# 4. Visualize the detections on the image

import numpy as np
import cv2
from datatypes import datatypes
from telekinesis.medulla.cameras.webcam import Webcam
from telekinesis import retina, pupil

# Step 1: Initialize and connect to the webcam
webcam = Webcam(name="main_webcam", camera_id=0)
webcam.connect()

# Steps 2-3: Capture a single color frame, always releasing the camera.
# try/finally guarantees disconnect() runs even if the capture raises,
# so the device is not left locked for other processes.
try:
    frame = webcam.capture_single_color_frame()
finally:
    webcam.disconnect()

# Step 4: Wrap the captured frame in a datatypes.Image for the detector API
image = datatypes.Image(image=frame)

# Step 5: Use Qwen VLM to detect "silver glasses" specifically.
# retina.detect_objects_using_qwen is best suited for specific/unusual
# object descriptions that a fixed-vocabulary detector would miss.
annotations = retina.detect_objects_using_qwen(
    image=image,
    objects_to_detect="silver glasses",
    model_name="Qwen/Qwen2.5-VL-7B-Instruct",
)

# Step 6: Draw the detections on a copy so the original frame stays untouched
vis_frame = frame.copy()

# Draw bounding boxes for each detected annotation
for ann in annotations.annotations:
    # bbox is assumed to be XYWH format: [x, y, w, h] — TODO confirm against
    # the annotation schema returned by retina.detect_objects_using_qwen
    bbox = ann.get("bbox", [0, 0, 0, 0])
    x, y, w, h = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
    score = ann.get("score", None)

    # Draw rectangle (green box, 2 px thick)
    cv2.rectangle(vis_frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

    # Label above the box, with score if available; clamp the baseline to the
    # top edge so the text stays visible for boxes near y = 0
    label = "silver glasses"
    if score is not None:
        label = f"{label}: {score:.2f}"
    cv2.putText(
        vis_frame,
        label,
        (x, max(y - 10, 0)),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.6,
        (0, 255, 0),
        2,
    )

# Step 7: Display the result.
# The captured frame is presumably RGB (TODO confirm with Webcam docs);
# OpenCV display/save functions expect BGR, hence the conversion.
vis_bgr = cv2.cvtColor(vis_frame, cv2.COLOR_RGB2BGR)
cv2.imshow("Silver Glasses Detection", vis_bgr)
cv2.waitKey(0)
cv2.destroyAllWindows()

# Step 8: Save the annotated image to the working directory
output_path = "silver_glasses_detection.png"
cv2.imwrite(output_path, vis_bgr)
print(f"Detection result saved to: {output_path}")
print(f"Number of detections: {len(annotations.annotations)}")