So erstellen Sie mit MolmoWeb-4B einen visionsgesteuerten Web-KI-Agenten unter Verwendung von multimodalem Denken und Aktionsvorhersage

def parse_click_coords(action_str):
    """
    Extract the normalised (x, y) coordinates from a click action string.

    e.g. 'click on(0.45, 0.32)' -> (0.45, 0.32)

    The bare 'click(0.45, 0.32)' spelling is accepted as well. The pattern is
    searched for anywhere in ``action_str``, so leading or trailing text
    around the action is tolerated.

    Args:
        action_str: Action text to inspect.

    Returns:
        A ``(x, y)`` tuple of floats, or ``None`` if ``action_str`` does not
        contain a well-formed click action.
    """
    # A well-formed decimal ("0.45", "1", ".5", "0."). Anything this matches
    # is guaranteed to be accepted by float(), unlike a loose "[\d.]+" which
    # would also match "1.2.3" or ".".
    number = r"(\d+\.?\d*|\.\d+)"
    match = re.search(
        r"click(?: on)?\(\s*" + number + r"\s*,\s*" + number + r"\s*\)",
        action_str,
    )
    if match is None:
        return None
    return float(match.group(1)), float(match.group(2))




def parse_action_details(action_str):
    """
    Parse a MolmoWeb action string into a structured dict.

    Surrounding whitespace is stripped, then the string is matched from its
    start against each known action form in turn.

    Args:
        action_str: Action text, e.g. 'click on(0.45, 0.32)' or
            'goto("https://...")'.

    Returns:
        One of the following dicts, depending on the action recognised:
            {"kind": "click on", "x": 0.45, "y": 0.32}
            {"kind": "goto", "url": "https://..."}
            {"kind": "kind", "textual content": "question textual content"}
            {"kind": "scroll", "path": "down"}
            {"kind": "press", "key": "Enter"}
            {"kind": "send_msg", "message": "The reply is ..."}
            {"kind": "new_tab" | "go_back" | "switch_tab"}  (plus "tab": int
                when a numeric argument is given)
            {"kind": "unknown", "uncooked": "..."}  (the stripped input)
    """
    action_str = action_str.strip()

    # Well-formed decimal, so float() below can never raise on a match.
    number = r"(\d+\.?\d*|\.\d+)"
    # A single- or double-quoted, non-empty argument (captured without quotes).
    quoted = r"""["'](.+?)["']"""

    # Click: accepts both the 'click on(' and the bare 'click(' spelling.
    m = re.match(
        r"click(?: on)?\(\s*" + number + r"\s*,\s*" + number + r"\s*\)",
        action_str,
    )
    if m:
        return {"kind": "click on", "x": float(m.group(1)), "y": float(m.group(2))}

    m = re.match(r"goto\(\s*" + quoted + r"\s*\)", action_str)
    if m:
        return {"kind": "goto", "url": m.group(1)}

    # Text entry: accepts both the 'kind(' and the 'type(' spelling.
    m = re.match(r"(?:kind|type)\(\s*" + quoted + r"\s*\)", action_str)
    if m:
        return {"kind": "kind", "textual content": m.group(1)}

    # The scroll direction may be given with or without quotes.
    m = re.match(r"""scroll\(\s*["']?(up|down)["']?\s*\)""", action_str)
    if m:
        return {"kind": "scroll", "path": m.group(1)}

    m = re.match(r"press\(\s*" + quoted + r"\s*\)", action_str)
    if m:
        return {"kind": "press", "key": m.group(1)}

    # DOTALL lets a message span several lines.
    m = re.match(r"send_msg\(\s*" + quoted + r"\s*\)", action_str, re.DOTALL)
    if m:
        return {"kind": "send_msg", "message": m.group(1)}

    # Tab/navigation actions take an optional integer argument.
    m = re.match(r"(new_tab|go_back|switch_tab)\(\s*(\d*)\s*\)", action_str)
    if m:
        result = {"kind": m.group(1)}
        if m.group(2):
            result["tab"] = int(m.group(2))
        return result

    return {"kind": "unknown", "uncooked": action_str}




def visualise_click(picture, action_str, title="MolmoWeb Prediction"):
    """
    Draw the predicted click location on a screenshot and display it.

    The click coordinates are obtained from ``parse_click_coords`` and are
    normalised to the 0-1 range; they are scaled by the image width and
    height to get pixel positions. When ``action_str`` is not a click, the
    raw action text is shown in a banner near the bottom of the image.

    Args:
        picture: Screenshot to display. It is passed to ``ax.imshow`` and must
            expose ``.size`` as ``(width, height)`` -- presumably a PIL image;
            verify against callers.
        action_str: Action text predicted by the model.
        title: Title shown above the figure.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    coords = parse_click_coords(action_str)

    _, ax = plt.subplots(1, 1, figsize=(12, 7))
    ax.imshow(picture)
    ax.set_title(title, fontsize=14)

    if coords is not None:
        x_norm, y_norm = coords
        w, h = picture.size
        # Normalised -> pixel coordinates.
        x_px, y_px = x_norm * w, y_norm * h

        # Hollow ring plus a cross-hair marks the click target.
        circle = patches.Circle(
            (x_px, y_px), radius=18, linewidth=3,
            edgecolor="crimson", facecolor="none",
        )
        ax.add_patch(circle)
        ax.plot(x_px, y_px, "r+", markersize=20, markeredgewidth=3)

        # Label offset up and to the right, with an arrow back to the target.
        ax.annotate(
            f"click on({x_norm:.3f}, {y_norm:.3f})",
            (x_px, y_px), xytext=(x_px + 25, y_px - 25),
            fontsize=11, color="white",
            bbox=dict(boxstyle="round,pad=0.3", facecolor="crimson", alpha=0.8),
            arrowprops=dict(arrowstyle="->", color="crimson", lw=2),
        )
    else:
        # Axes-relative placement (transAxes): centred, just above the bottom.
        ax.text(
            0.5, 0.02, f"Motion: {action_str}", transform=ax.transAxes,
            fontsize=12, ha="center", color="white",
            bbox=dict(boxstyle="round,pad=0.4", facecolor="blue", alpha=0.8),
        )

    ax.axis("off")
    plt.tight_layout()
    plt.show()




def download_image(url, dimension=(1280, 720)):
    """
    Download an image from a URL and resize it to browser viewport dimensions.

    Args:
        url: Address of the image to fetch (15 second timeout).
        dimension: Target ``(width, height)`` in pixels.

    Returns:
        The image converted to RGB and resized to ``dimension`` with LANCZOS
        resampling.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Imported locally so the name is in scope regardless of how the rest of
    # the file imports Pillow.
    from PIL import Image

    response = requests.get(url, timeout=15)
    # Fail with a clear HTTP error instead of handing an error page to Pillow.
    response.raise_for_status()
    img = Image.open(BytesIO(response.content)).convert("RGB")
    return img.resize(dimension, Image.LANCZOS)




def create_synthetic_webpage(title="Instance Web page", parts=None):
    """
    Create a synthetic 1280x720 webpage screenshot for testing.

    The image has a white background, a grey top bar containing an address
    field and three grey dots, the page title, and then the requested
    elements.

    Args:
        title: Heading text drawn below the top bar.
        parts: Optional list of dicts, one per element to draw. Each dict is
            read with these keys:
                "kind": "button", "enter", "textual content" or "hyperlink"
                "textual content": str, the label or content to draw
                "pos": (x, y), top-left pixel position
            Elements with any other "kind" are skipped. A missing key raises
            ``KeyError``.

    Returns:
        The rendered RGB image.
    """
    # Imported locally so the names are in scope regardless of how the rest
    # of the file imports Pillow.
    from PIL import Image, ImageDraw

    img = Image.new("RGB", (1280, 720), color=(255, 255, 255))
    draw = ImageDraw.Draw(img)

    # Top bar with an address field showing a fixed URL.
    draw.rectangle((0, 0, 1280, 50), fill=(240, 240, 240))
    draw.rectangle((180, 10, 900, 40), outline=(200, 200, 200), width=1, fill="white")
    draw.text((200, 16), "https://www.instance.com", fill=(100, 100, 100))

    # Three grey dots at the left of the top bar.
    for cx in (30, 60, 90):
        draw.ellipse((cx - 8, 17, cx + 8, 33), fill=(200, 200, 200))

    draw.text((50, 70), title, fill="black")

    for el in parts or ():
        x, y = el["pos"]
        kind = el["kind"]
        if kind == "button":
            # Filled blue box with a white label.
            draw.rectangle((x, y, x + 150, y + 35), fill=(66, 133, 244))
            draw.text((x + 10, y + 8), el["textual content"], fill="white")
        elif kind == "enter":
            # Outlined box with grey text inside.
            draw.rectangle((x, y, x + 300, y + 35), outline=(180, 180, 180), width=2)
            draw.text((x + 10, y + 8), el["textual content"], fill=(150, 150, 150))
        elif kind == "textual content":
            draw.text((x, y), el["textual content"], fill="black")
        elif kind == "hyperlink":
            draw.text((x, y), el["textual content"], fill=(66, 133, 244))

    return img




print("Helper features outlined efficiently.")


# Section 5: run a single inference step from an empty page, where the agent
# has only the task description and a blank screenshot to go on.

# Imported here so `Image` is in scope regardless of how the rest of the file
# imports Pillow.
from PIL import Image

print("\n" + "=" * 70)
print("SECTION 5: Single-step inference - clean web page (chilly begin)")
print("=" * 70)
print("The agent begins at about:blank and should determine its first motion.\n")


# Plain white image at the same 1280x720 size used by the helpers above.
blank_image = Image.new("RGB", (1280, 720), color="white")


process = "Go to arxiv.org and discover the most recent paper about Molmo from Ai2"


# build_prompt is defined elsewhere in this file.
immediate = build_prompt(
    task_description=process,
    page_url="about:blank",
    page_index=0,
)


print(f"Activity: {process}")
print("Screenshot: clean white picture (about:blank)")
print("Operating inference...\n")


# run_inference is defined elsewhere in this file.
raw_output = run_inference(immediate, blank_image)


print(f"Uncooked mannequin output:\n{raw_output}\n")


# parse_thought_and_action is defined elsewhere in this file; its result is
# indexed by the 'thought' and 'motion' keys below.
parsed = parse_thought_and_action(raw_output)
print(f"Thought: {parsed['thought']}")
print(f"Motion:  {parsed['motion']}")


action_details = parse_action_details(parsed["motion"])
print(f"Parsed:  {action_details}")

So erstellen Sie mit MolmoWeb-4B einen visionsgesteuerten Web-KI-Agenten unter Verwendung von multimodalem Denken und Aktionsvorhersage

Von admin

Schreibe einen Kommentar Antworten abbrechen

Versäumt

RIP OpenClaw? Treffen Sie Claude Dispatch

So erstellen Sie mit MolmoWeb-4B einen visionsgesteuerten Web-KI-Agenten unter Verwendung von multimodalem Denken und Aktionsvorhersage

5 Branchen, die das Wachstum der Big-Data-Technologie vorantreiben

Vibe Coding eines privaten KI-Finanzanalysten mit Python und lokalen LLMs

About

Categories

Tags

Recent Post

RIP OpenClaw? Treffen Sie Claude Dispatch

So erstellen Sie mit MolmoWeb-4B einen visionsgesteuerten Web-KI-Agenten unter Verwendung von multimodalem Denken und Aktionsvorhersage

Von admin

Ähnlicher Beitrag

Schreibe einen Kommentar Antworten abbrechen

Versäumt