Detect Keyboard from Webcam Image
This example demonstrates how to use Tzara, the Telekinesis Physical AI Agent, to build a computer vision pipeline that detects a keyboard from a webcam feed. Tzara generates code that captures an image from the webcam, runs a pre-trained YOLOx model for object detection, and outputs bounding boxes and annotations for each detected keyboard.
The Natural Language Instruction
The natural language instruction for this task is:
Capture an image with webcam and detect the keyboardsThe Generated Code
The generated code captures an image from the webcam, detects keyboards using YOLOx, and prints the bounding boxes and confidence scores for each detected keyboard.
python
# Pipeline: Capture image from webcam, then detect keyboards using YOLOX object detection
# Import required modules
from telekinesis.medulla.cameras.webcam import Webcam
from telekinesis import retina
from datatypes import datatypes
# Step 1: Initialize the webcam with a unique name and default camera ID (0)
cam = Webcam(name="webcam_0", camera_id=0)
# Step 2: Connect to the webcam
cam.connect()
# Step 3: Capture a single color frame from the webcam
# Returns a numpy array in RGB format (H, W, 3)
frame = cam.capture_single_color_frame()
# Step 4: Disconnect the webcam after capturing the frame
cam.disconnect()
# Step 5: Wrap the captured frame in a datatypes.Image object
image = datatypes.Image(image=frame)
# Step 6: Run object detection using YOLOX to detect all objects in the image
# YOLOX detects COCO classes which includes "keyboard" (class id 66)
annotations, categories = retina.detect_objects_using_yolox(
image=image,
score_threshold=0.25,
nms_threshold=0.45,
)
# Step 7: Filter detections to keep only "keyboard" detections
# Find the category id for "keyboard" from the returned categories
keyboard_category_ids = []
if hasattr(categories, 'categories') and categories.categories is not None:
for cat in categories.categories:
if isinstance(cat, dict):
if cat.get("name", "").lower() == "keyboard":
keyboard_category_ids.append(cat["id"])
else:
if hasattr(cat, "name") and cat.name.lower() == "keyboard":
keyboard_category_ids.append(cat.id)
# Step 8: Filter annotations to only include keyboard detections
keyboard_detections = []
if hasattr(annotations, 'annotations') and annotations.annotations is not None:
for ann in annotations.annotations:
if isinstance(ann, dict):
if ann.get("category_id") in keyboard_category_ids:
keyboard_detections.append(ann)
else:
if hasattr(ann, "category_id") and ann.category_id in keyboard_category_ids:
keyboard_detections.append(ann)
# Step 9: Print results
print(f"Detected {len(keyboard_detections)} keyboard(s) in the captured image.")
for i, det in enumerate(keyboard_detections):
if isinstance(det, dict):
print(f" Keyboard {i+1}: bbox={det.get('bbox')}, score={det.get('score')}")
else:
print(f" Keyboard {i+1}: bbox={getattr(det, 'bbox', None)}, score={getattr(det, 'score', None)}")
