Detect Keyboard from Webcam Image
This example demonstrates how to use Tzara, the Telekinesis Physical AI Agent, to build a computer vision pipeline that detects a keyboard from a webcam feed. Tzara generates code that captures an image from the webcam, runs a pre-trained YOLOx model for object detection, and outputs bounding boxes and annotations for each detected keyboard.
Keyboard detected with YOLOX — bounding box and confidence score
The Natural Language Instruction
The natural language instruction for this task is:
Capture an image with webcam and detect the keyboardsThe Generated Code
The generated code captures an image from the webcam, detects keyboards using YOLOx, and prints the bounding boxes and confidence scores for each detected keyboard.
python
# Pipeline: Capture image from webcam, then detect keyboards using YOLOX object detection
from telekinesis.medulla.cameras.webcam import Webcam
from telekinesis import retina
from datatypes import datatypes
cam = Webcam(name="webcam_0", camera_id=0)
cam.connect()
try:
# Step 1: Capture a single color frame from the webcam
frame = cam.capture_color_image()
# Step 2: Wrap the frame and run YOLOX detection
image = datatypes.Image(image=frame)
annotations, categories = retina.detect_objects_using_yolox(
image=image,
score_threshold=0.25,
nms_threshold=0.45,
)
# Step 3: Filter to "keyboard" detections
keyboard_category_ids = []
if hasattr(categories, 'categories') and categories.categories is not None:
for cat in categories.categories:
if isinstance(cat, dict):
if cat.get("name", "").lower() == "keyboard":
keyboard_category_ids.append(cat["id"])
else:
if hasattr(cat, "name") and cat.name.lower() == "keyboard":
keyboard_category_ids.append(cat.id)
keyboard_detections = []
if hasattr(annotations, 'annotations') and annotations.annotations is not None:
for ann in annotations.annotations:
if isinstance(ann, dict):
if ann.get("category_id") in keyboard_category_ids:
keyboard_detections.append(ann)
else:
if hasattr(ann, "category_id") and ann.category_id in keyboard_category_ids:
keyboard_detections.append(ann)
# Step 4: Print results
print(f"Detected {len(keyboard_detections)} keyboard(s) in the captured image.")
for i, det in enumerate(keyboard_detections):
if isinstance(det, dict):
print(f" Keyboard {i+1}: bbox={det.get('bbox')}, score={det.get('score')}")
else:
print(f" Keyboard {i+1}: bbox={getattr(det, 'bbox', None)}, score={getattr(det, 'score', None)}")
finally:
cam.disconnect()

