feat: add goto_page action handler and dev changes

new observer thoughts (Skyvern-AI#1442) add workflow run block screenshots (Skyvern-AI#1443) add workflow run block screenshot and observer thought screenshots (Skyvern-AI#1444) do not show metadata thought yet (Skyvern-AI#1445) chore: remove access keys from docker compose
prathamesh-88 · Dec 29, 2024 · d3414c9 · d3414c9
1 parent d03957d
commit d3414c9
Show file tree

Hide file tree

Showing 24 changed files with 891 additions and 189 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -14,7 +14,8 @@ RUN playwright install-deps
 RUN playwright install
 RUN apt-get install -y xauth x11-apps netpbm && apt-get clean
 
-COPY . /app
+# Commented out for local development
+# COPY . /app
 
 ENV PYTHONPATH="/app:$PYTHONPATH"
 ENV VIDEO_PATH=/data/videos

diff --git a/alembic/versions/2024_12_27_1610-d13af1e466fa_new_observer_thoughts.py b/alembic/versions/2024_12_27_1610-d13af1e466fa_new_observer_thoughts.py
@@ -0,0 +1,35 @@
+"""new observer thoughts
+
+Revision ID: d13af1e466fa
+Revises: 835522a23b19
+Create Date: 2024-12-27 16:10:36.555540+00:00
+
+"""
+
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "d13af1e466fa"
+down_revision: Union[str, None] = "835522a23b19"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column("observer_thoughts", sa.Column("observer_thought_type", sa.String(), nullable=True))
+    op.add_column("observer_thoughts", sa.Column("observer_thought_scenario", sa.String(), nullable=True))
+    op.add_column("observer_thoughts", sa.Column("output", sa.JSON(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column("observer_thoughts", "output")
+    op.drop_column("observer_thoughts", "observer_thought_scenario")
+    op.drop_column("observer_thoughts", "observer_thought_type")
+    # ### end Alembic commands ###
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,53 +1,61 @@
 services:
-  postgres:
-    image: postgres:14-alpine
-    restart: always
-    # comment out if you want to externally connect DB
-    # ports:
-    #   - 5432:5432
-    volumes:
-      - ./postgres-data:/var/lib/postgresql/data
-    environment:
-      - PGDATA=/var/lib/postgresql/data/pgdata
-      - POSTGRES_USER=skyvern
-      - POSTGRES_PASSWORD=skyvern
-      - POSTGRES_POSTGRES_DB=skyvern
-    healthcheck:
-      test: ["CMD-SHELL", "pg_isready -U skyvern"]
-      interval: 5s
-      timeout: 5s
-      retries: 5
+  # uncomment if you want to use postgres
+  # postgres:
+  #   image: postgres:14-alpine
+  #   restart: always
+  #   # comment out if you want to externally connect DB
+  #   # ports:
+  #   #   - 5432:5432
+  #   volumes:
+  #     - ./postgres-data:/var/lib/postgresql/data
+  #   environment:
+  #     - PGDATA=/var/lib/postgresql/data/pgdata
+  #     - POSTGRES_USER=skyvern
+  #     - POSTGRES_PASSWORD=skyvern
+  #     - POSTGRES_DB=skyvern
+  #   healthcheck:
+  #     test: ["CMD-SHELL", "pg_isready -U skyvern"]
+  #     interval: 5s
+  #     timeout: 5s
+  #     retries: 5
 
   skyvern:
-    image: public.ecr.aws/skyvern/skyvern:latest
+    build:
+      dockerfile: Dockerfile
     restart: on-failure
     # comment out if you want to externally call skyvern API
     ports:
       - 8000:8000
     volumes:
+      - ./:/app
       - ./artifacts:/data/artifacts
       - ./videos:/data/videos
       - ./har:/data/har
       - ./log:/data/log
       - ./.streamlit:/app/.streamlit
     environment:
-      - DATABASE_STRING=postgresql+psycopg://skyvern:skyvern@postgres:5432/skyvern
+      - DATABASE_STRING=${SKYVERN_DATABASE_STRING}
       - BROWSER_TYPE=chromium-headful
       - ENABLE_OPENAI=true
-      - OPENAI_API_KEY=<your_openai_key>
+      - OPENAI_API_KEY=${OPENAI_API_KEY}
+      - LLM_KEY=OPENAI_GPT4O
+      # - AWS_DEFAULT_REGION=us-east-1
+      # - AWS_ACCESS_KEY_ID=<your_aws_access_key_id>
+      # - AWS_SECRET_ACCESS_KEY=<your_aws_secret_access_key>
+      # - SKYVERN_STORAGE_TYPE=s3
       # If you want to use other LLM provider, like azure and anthropic:
       # - ENABLE_ANTHROPIC=true
-      # - LLM_KEY=ANTHROPIC_CLAUDE3_OPUS
-      # - ANTHROPIC_API_KEY=<your_anthropic_key>
+      # - LLM_KEY=ANTHROPIC_CLAUDE3
+      # - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
       # - ENABLE_AZURE=true
       # - LLM_KEY=AZURE_OPENAI
       # - AZURE_DEPLOYMENT=<your_azure_deployment>
       # - AZURE_API_KEY=<your_azure_api_key>
       # - AZURE_API_BASE=<your_azure_api_base>
       # - AZURE_API_VERSION=<your_azure_api_version>
-    depends_on:
-      postgres:
-        condition: service_healthy
+    # depends_on:
+    #   postgres:
+    #     condition: service_healthy
     healthcheck:
       test: ["CMD", "test", "-f", "/app/.streamlit/secrets.toml"]
       interval: 5s
@@ -68,7 +76,7 @@ services:
     environment:
     # if you want to run skyvern on a remote server,
     # you need to change the host in VITE_WSS_BASE_URL and VITE_API_BASE_URL to match your server ip
-      - VITE_WSS_BASE_URL=ws://localhost:8000/api/v1
+      - VITE_WSS_BASE_URL=ws://host.docker.internal:8000/api/v1
     #   - VITE_API_BASE_URL=http://localhost:8000/api/v1
     #   - VITE_SKYVERN_API_KEY=
     depends_on:

diff --git a/skyvern-frontend/src/api/types.ts b/skyvern-frontend/src/api/types.ts
@@ -139,6 +139,7 @@ export const ActionTypes = {
   Click: "click",
   SelectOption: "select_option",
   UploadFile: "upload_file",
+  GotoPage: "goto_page",
   complete: "complete",
   wait: "wait",
   terminate: "terminate",
@@ -154,6 +155,7 @@ export const ReadableActionTypes: {
   click: "Click",
   select_option: "Select Option",
   upload_file: "Upload File",
+  goto_page: "Goto Page",
   complete: "Complete",
   wait: "Wait",
   terminate: "Terminate",

diff --git a/skyvern-frontend/src/routes/workflows/workflowRun/ObserverThoughtScreenshot.tsx b/skyvern-frontend/src/routes/workflows/workflowRun/ObserverThoughtScreenshot.tsx
@@ -61,7 +61,7 @@ function ObserverThoughtScreenshot({ observerThoughtId, taskStatus }: Props) {
   if (!screenshot) {
     return (
       <div className="flex h-full items-center justify-center bg-slate-elevation1">
-        No screenshot found for this action.
+        No screenshot found for this thought.
       </div>
     );
   }

diff --git a/skyvern-frontend/src/routes/workflows/workflowRun/WorkflowRunBlockScreenshot.tsx b/skyvern-frontend/src/routes/workflows/workflowRun/WorkflowRunBlockScreenshot.tsx
@@ -0,0 +1,66 @@
+import { getClient } from "@/api/AxiosClient";
+import { ArtifactApiResponse, ArtifactType } from "@/api/types";
+import { ZoomableImage } from "@/components/ZoomableImage";
+import { useCredentialGetter } from "@/hooks/useCredentialGetter";
+import { useQuery } from "@tanstack/react-query";
+import { ReloadIcon } from "@radix-ui/react-icons";
+import { getImageURL } from "@/routes/tasks/detail/artifactUtils";
+
+// Props accepted by WorkflowRunBlockScreenshot.
+type Props = {
+  // Id of the workflow run block whose artifacts are requested.
+  workflowRunBlockId: string;
+};
+
+/**
+ * Shows the first LLM screenshot artifact of a workflow run block.
+ *
+ * Fetches the block's artifacts and keeps re-fetching every 5 seconds until
+ * an LLM screenshot is present. Renders a spinner while the query is loading
+ * and a placeholder message when no screenshot is available.
+ */
+function WorkflowRunBlockScreenshot({ workflowRunBlockId }: Props) {
+  const credentialGetter = useCredentialGetter();
+
+  // First LLM screenshot in an artifact list, or undefined when there is none.
+  // Used by both the polling predicate and the render path below.
+  const firstLlmScreenshot = (items?: Array<ArtifactApiResponse>) =>
+    items?.find((item) => item.artifact_type === ArtifactType.LLMScreenshot);
+
+  const { data: artifacts, isLoading } = useQuery<Array<ArtifactApiResponse>>({
+    queryKey: ["workflowRunBlock", workflowRunBlockId, "artifacts"],
+    queryFn: async () => {
+      const client = await getClient(credentialGetter);
+      const response = await client.get(
+        `/workflow_run_block/${workflowRunBlockId}/artifacts`,
+      );
+      return response.data;
+    },
+    // Poll every 5 seconds until an LLM screenshot shows up, then stop.
+    refetchInterval: (query) =>
+      firstLlmScreenshot(query.state.data) ? false : 5000,
+  });
+
+  const screenshot = firstLlmScreenshot(artifacts);
+
+  if (isLoading) {
+    return (
+      <div className="flex h-full items-center justify-center gap-2 bg-slate-elevation1">
+        <ReloadIcon className="h-6 w-6 animate-spin" />
+        <div>Loading screenshot...</div>
+      </div>
+    );
+  }
+
+  if (!screenshot) {
+    return (
+      <div className="flex h-full items-center justify-center bg-slate-elevation1">
+        No screenshot found for this workflow run block.
+      </div>
+    );
+  }
+
+  return (
+    <figure className="mx-auto flex max-w-full flex-col items-center gap-2 overflow-hidden rounded">
+      <ZoomableImage src={getImageURL(screenshot)} alt="llm-screenshot" />
+    </figure>
+  );
+}
+
+export { WorkflowRunBlockScreenshot };
diff --git a/skyvern-frontend/src/routes/workflows/workflowRun/WorkflowRunOverview.tsx b/skyvern-frontend/src/routes/workflows/workflowRun/WorkflowRunOverview.tsx
@@ -23,6 +23,7 @@ import { DotFilledIcon } from "@radix-ui/react-icons";
 import { WorkflowRunTimelineItemInfoSection } from "./WorkflowRunTimelineItemInfoSection";
 import { ObserverThoughtScreenshot } from "./ObserverThoughtScreenshot";
 import { ScrollArea, ScrollAreaViewport } from "@/components/ui/scroll-area";
+import { WorkflowRunBlockScreenshot } from "./WorkflowRunBlockScreenshot";
 
 export type ActionItem = {
   block: WorkflowRunBlock;
@@ -103,9 +104,9 @@ function WorkflowRunOverview() {
             />
           )}
           {isWorkflowRunBlock(selection) && (
-            <div className="flex h-full w-full items-center justify-center bg-slate-elevation1">
-              No screenshot found for this block
-            </div>
+            <WorkflowRunBlockScreenshot
+              workflowRunBlockId={selection.workflow_run_block_id}
+            />
           )}
           {isObserverThought(selection) && (
             <ObserverThoughtScreenshot

diff --git a/skyvern/config.py b/skyvern/config.py
@@ -132,6 +132,7 @@ class Settings(BaseSettings):
     SVG_MAX_LENGTH: int = 100000
 
     ENABLE_LOG_ARTIFACTS: bool = False
+    STREAMING_FILE_BASE_PATH: str = "./temp"
 
     def is_cloud_environment(self) -> bool:
         """

diff --git a/skyvern/forge/prompts/skyvern/extract-action.j2 b/skyvern/forge/prompts/skyvern/extract-action.j2
@@ -18,10 +18,11 @@ Reply in JSON format with the following keys:
         "user_detail_query": str, // Think of this value as a Jeopardy question. Ask the user for the details you need for executing this action. Ask the question even if the details are disclosed in user goal or user details. If it's a text field, ask for the text. If it's a file upload, ask for the file. If it's a dropdown, ask for the relevant information. If you are clicking on something specific, ask about what to click on. If you're downloading a file and you have multiple options, ask the user which one to download. Otherwise, use null. Examples are: "What product ID should I input into the search bar?", "What file should I upload?", "What is the previous insurance provider of the user?", "Which invoice should I download?", "Does the user have any pets?". If the action doesn't require any user details, use null.
         "user_detail_answer": str, // The answer to the `user_detail_query`. The source of this answer can be user goal or user details.
         "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
-        "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE". "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the user goal has been achieved AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the user goal is achieved. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.
+        "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "GOTO_PAGE", "COMPLETE", "TERMINATE". "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "GOTO_PAGE" is used when you want to go to another url but the current page does not have any link or action that leads to that url. "COMPLETE" is used when the user goal has been achieved AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the user goal is achieved. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.
         "id": str, // The id of the element to take action on. The id has to be one from the elements list
         "text": str, // Text for INPUT_TEXT action only
         "file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise.
+        "url": str, // The url to go to if applicable. This field must be present for GOTO_PAGE action only. It should be null otherwise.
         "download": bool, // Can only be true for CLICK actions. If true, the browser will trigger a download by clicking the element. If false, the browser will click the element without triggering a download.
         "option": {  // The option to select for SELECT_OPTION action only. null if not SELECT_OPTION action
             "label": str, // the label of the option if any. MAKE SURE YOU USE THIS LABEL TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION LABEL HERE

diff --git a/skyvern/forge/prompts/skyvern/observer_loop_task_extraction_goal.j2 b/skyvern/forge/prompts/skyvern/observer_loop_task_extraction_goal.j2
@@ -1,6 +1,6 @@
 The user is trying to achieve a goal on the web. Now they've decided to go through a list of values and take the same tasks with each variant in the list.
 
-Help to user extract this list of values based on what they want to achieve:
+Help the user extract a list of values based on what they want to achieve:
 ```
 {{ plan }}
-```
+```
diff --git a/skyvern/forge/sdk/artifact/manager.py b/skyvern/forge/sdk/artifact/manager.py
@@ -9,6 +9,7 @@
 from skyvern.forge.sdk.db.id import generate_artifact_id
 from skyvern.forge.sdk.models import Step
 from skyvern.forge.sdk.schemas.observers import ObserverCruise, ObserverThought
+from skyvern.forge.sdk.schemas.workflow_runs import WorkflowRunBlock
 
 LOG = structlog.get_logger(__name__)
 
@@ -151,6 +152,27 @@ async def create_observer_cruise_artifact(
             path=path,
         )
 
+    async def create_workflow_run_block_artifact(
+        self,
+        workflow_run_block: WorkflowRunBlock,
+        artifact_type: ArtifactType,
+        data: bytes | None = None,
+        path: str | None = None,
+    ) -> str:
+        artifact_id = generate_artifact_id()
+        uri = app.STORAGE.build_workflow_run_block_uri(artifact_id, workflow_run_block, artifact_type)
+        return await self._create_artifact(
+            aio_task_primary_key=workflow_run_block.workflow_run_block_id,
+            artifact_id=artifact_id,
+            artifact_type=artifact_type,
+            uri=uri,
+            workflow_run_block_id=workflow_run_block.workflow_run_block_id,
+            workflow_run_id=workflow_run_block.workflow_run_id,
+            organization_id=workflow_run_block.organization_id,
+            data=data,
+            path=path,
+        )
+
     async def create_llm_artifact(
         self,
         data: bytes,

diff --git a/skyvern/forge/sdk/artifact/storage/base.py b/skyvern/forge/sdk/artifact/storage/base.py
@@ -3,6 +3,7 @@
 from skyvern.forge.sdk.artifact.models import Artifact, ArtifactType, LogEntityType
 from skyvern.forge.sdk.models import Step
 from skyvern.forge.sdk.schemas.observers import ObserverCruise, ObserverThought
+from skyvern.forge.sdk.schemas.workflow_runs import WorkflowRunBlock
 
 # TODO: This should be a part of the ArtifactType model
 FILE_EXTENTSION_MAP: dict[ArtifactType, str] = {
@@ -52,6 +53,12 @@ def build_observer_cruise_uri(
     ) -> str:
         pass
 
+    @abstractmethod
+    def build_workflow_run_block_uri(
+        self, artifact_id: str, workflow_run_block: WorkflowRunBlock, artifact_type: ArtifactType
+    ) -> str:
+        """Return the storage URI for an artifact that belongs to a workflow run block.
+
+        Abstract: concrete storage classes provide the implementation.
+        """
+        pass
+
     @abstractmethod
     async def store_artifact(self, artifact: Artifact, data: bytes) -> None:
         pass

diff --git a/skyvern/forge/sdk/artifact/storage/local.py b/skyvern/forge/sdk/artifact/storage/local.py
@@ -12,6 +12,7 @@
 from skyvern.forge.sdk.artifact.storage.base import FILE_EXTENTSION_MAP, BaseStorage
 from skyvern.forge.sdk.models import Step
 from skyvern.forge.sdk.schemas.observers import ObserverCruise, ObserverThought
+from skyvern.forge.sdk.schemas.workflow_runs import WorkflowRunBlock
 
 LOG = structlog.get_logger()
 
@@ -40,6 +41,12 @@ def build_observer_cruise_uri(
         file_ext = FILE_EXTENTSION_MAP[artifact_type]
         return f"file://{self.artifact_path}/{settings.ENV}/observers/{observer_cruise.observer_cruise_id}/{datetime.utcnow().isoformat()}_{artifact_id}_{artifact_type}.{file_ext}"
 
+    def build_workflow_run_block_uri(
+        self, artifact_id: str, workflow_run_block: WorkflowRunBlock, artifact_type: ArtifactType
+    ) -> str:
+        file_ext = FILE_EXTENTSION_MAP[artifact_type]
+        return f"file://{self.artifact_path}/{settings.ENV}/workflow_runs/{workflow_run_block.workflow_run_id}/{workflow_run_block.workflow_run_block_id}/{datetime.utcnow().isoformat()}_{artifact_id}_{artifact_type}.{file_ext}"
+
     async def store_artifact(self, artifact: Artifact, data: bytes) -> None:
         file_path = None
         try:

diff --git a/skyvern/forge/sdk/artifact/storage/s3.py b/skyvern/forge/sdk/artifact/storage/s3.py
@@ -16,6 +16,7 @@
 from skyvern.forge.sdk.artifact.storage.base import FILE_EXTENTSION_MAP, BaseStorage
 from skyvern.forge.sdk.models import Step
 from skyvern.forge.sdk.schemas.observers import ObserverCruise, ObserverThought
+from skyvern.forge.sdk.schemas.workflow_runs import WorkflowRunBlock
 
 
 class S3Storage(BaseStorage):
@@ -43,6 +44,12 @@ def build_observer_cruise_uri(
         file_ext = FILE_EXTENTSION_MAP[artifact_type]
         return f"s3://{self.bucket}/{settings.ENV}/observers/{observer_cruise.observer_cruise_id}/{datetime.utcnow().isoformat()}_{artifact_id}_{artifact_type}.{file_ext}"
 
+    def build_workflow_run_block_uri(
+        self, artifact_id: str, workflow_run_block: WorkflowRunBlock, artifact_type: ArtifactType
+    ) -> str:
+        file_ext = FILE_EXTENTSION_MAP[artifact_type]
+        return f"s3://{self.bucket}/{settings.ENV}/workflow_runs/{workflow_run_block.workflow_run_id}/{workflow_run_block.workflow_run_block_id}/{datetime.utcnow().isoformat()}_{artifact_id}_{artifact_type}.{file_ext}"
+
     async def store_artifact(self, artifact: Artifact, data: bytes) -> None:
         await self.async_client.upload_file(artifact.uri, data)