Skip to content

Commit

Permalink
feat: add goto_page action handler and dev changes
Browse files Browse the repository at this point in the history
new observer thoughts (Skyvern-AI#1442)

add workflow run block screenshots (Skyvern-AI#1443)

add workflow run block screenshot and observer thought screenshots (Skyvern-AI#1444)

do not show metadata thought yet (Skyvern-AI#1445)

chore: remove access keys from docker compose
  • Loading branch information
prathamesh-88 committed Dec 29, 2024
1 parent d03957d commit d3414c9
Show file tree
Hide file tree
Showing 24 changed files with 891 additions and 189 deletions.
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ RUN playwright install-deps
RUN playwright install
RUN apt-get install -y xauth x11-apps netpbm && apt-get clean

COPY . /app
# Commented out for local development
# COPY . /app

ENV PYTHONPATH="/app:$PYTHONPATH"
ENV VIDEO_PATH=/data/videos
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""new observer thoughts
Revision ID: d13af1e466fa
Revises: 835522a23b19
Create Date: 2024-12-27 16:10:36.555540+00:00
"""

from typing import Sequence, Union

import sqlalchemy as sa

from alembic import op

# revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the ID of the
# migration it revises (see "Revises" in the module docstring).
revision: str = "d13af1e466fa"
down_revision: Union[str, None] = "835522a23b19"
# No branch labels or dependencies on other revisions are declared here.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add the thought type, thought scenario and output columns to ``observer_thoughts``."""
    # All three columns are declared nullable; they are added in this order.
    new_columns = (
        sa.Column("observer_thought_type", sa.String(), nullable=True),
        sa.Column("observer_thought_scenario", sa.String(), nullable=True),
        sa.Column("output", sa.JSON(), nullable=True),
    )
    for column in new_columns:
        op.add_column("observer_thoughts", column)


def downgrade() -> None:
    """Drop the ``output``, thought scenario and thought type columns from ``observer_thoughts``."""
    # Columns are removed in the reverse of the order in which upgrade() adds them.
    for column_name in ("output", "observer_thought_scenario", "observer_thought_type"):
        op.drop_column("observer_thoughts", column_name)
62 changes: 35 additions & 27 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,53 +1,61 @@
services:
postgres:
image: postgres:14-alpine
restart: always
# comment out if you want to externally connect DB
# ports:
# - 5432:5432
volumes:
- ./postgres-data:/var/lib/postgresql/data
environment:
- PGDATA=/var/lib/postgresql/data/pgdata
- POSTGRES_USER=skyvern
- POSTGRES_PASSWORD=skyvern
- POSTGRES_POSTGRES_DB=skyvern
healthcheck:
test: ["CMD-SHELL", "pg_isready -U skyvern"]
interval: 5s
timeout: 5s
retries: 5
# uncomment if you want to use postgres
# postgres:
# image: postgres:14-alpine
# restart: always
# # comment out if you want to externally connect DB
# # ports:
# # - 5432:5432
# volumes:
# - ./postgres-data:/var/lib/postgresql/data
# environment:
# - PGDATA=/var/lib/postgresql/data/pgdata
# - POSTGRES_USER=skyvern
# - POSTGRES_PASSWORD=skyvern
# - POSTGRES_DB=skyvern
# healthcheck:
# test: ["CMD-SHELL", "pg_isready -U skyvern"]
# interval: 5s
# timeout: 5s
# retries: 5

skyvern:
image: public.ecr.aws/skyvern/skyvern:latest
build:
dockerfile: Dockerfile
restart: on-failure
# comment out if you want to externally call skyvern API
ports:
- 8000:8000
volumes:
- ./:/app
- ./artifacts:/data/artifacts
- ./videos:/data/videos
- ./har:/data/har
- ./log:/data/log
- ./.streamlit:/app/.streamlit
environment:
- DATABASE_STRING=postgresql+psycopg://skyvern:skyvern@postgres:5432/skyvern
- DATABASE_STRING=${SKYVERN_DATABASE_STRING}
- BROWSER_TYPE=chromium-headful
- ENABLE_OPENAI=true
- OPENAI_API_KEY=<your_openai_key>
- OPENAI_API_KEY=${OPENAI_API_KEY}
- LLM_KEY=OPENAI_GPT4O
# - AWS_DEFAULT_REGION=us-east-1
# - AWS_ACCESS_KEY_ID=<your_aws_access_key_id>
# - AWS_SECRET_ACCESS_KEY=<your_aws_secret_access_key>
# - SKYVERN_STORAGE_TYPE=s3
# If you want to use other LLM provider, like azure and anthropic:
# - ENABLE_ANTHROPIC=true
# - LLM_KEY=ANTHROPIC_CLAUDE3_OPUS
# - ANTHROPIC_API_KEY=<your_anthropic_key>
# - LLM_KEY=ANTHROPIC_CLAUDE3
# - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
# - ENABLE_AZURE=true
# - LLM_KEY=AZURE_OPENAI
# - AZURE_DEPLOYMENT=<your_azure_deployment>
# - AZURE_API_KEY=<your_azure_api_key>
# - AZURE_API_BASE=<your_azure_api_base>
# - AZURE_API_VERSION=<your_azure_api_version>
depends_on:
postgres:
condition: service_healthy
# depends_on:
# postgres:
# condition: service_healthy
healthcheck:
test: ["CMD", "test", "-f", "/app/.streamlit/secrets.toml"]
interval: 5s
Expand All @@ -68,7 +76,7 @@ services:
environment:
# if you want to run skyvern on a remote server,
# you need to change the host in VITE_WSS_BASE_URL and VITE_API_BASE_URL to match your server ip
- VITE_WSS_BASE_URL=ws://localhost:8000/api/v1
- VITE_WSS_BASE_URL=ws://host.docker.internal:8000/api/v1
# - VITE_API_BASE_URL=http://localhost:8000/api/v1
# - VITE_SKYVERN_API_KEY=
depends_on:
Expand Down
2 changes: 2 additions & 0 deletions skyvern-frontend/src/api/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ export const ActionTypes = {
Click: "click",
SelectOption: "select_option",
UploadFile: "upload_file",
GotoPage: "goto_page",
complete: "complete",
wait: "wait",
terminate: "terminate",
Expand All @@ -154,6 +155,7 @@ export const ReadableActionTypes: {
click: "Click",
select_option: "Select Option",
upload_file: "Upload File",
goto_page: "Goto Page",
complete: "Complete",
wait: "Wait",
terminate: "Terminate",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ function ObserverThoughtScreenshot({ observerThoughtId, taskStatus }: Props) {
if (!screenshot) {
return (
<div className="flex h-full items-center justify-center bg-slate-elevation1">
No screenshot found for this action.
No screenshot found for this thought.
</div>
);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import { getClient } from "@/api/AxiosClient";
import { ArtifactApiResponse, ArtifactType } from "@/api/types";
import { ZoomableImage } from "@/components/ZoomableImage";
import { useCredentialGetter } from "@/hooks/useCredentialGetter";
import { useQuery } from "@tanstack/react-query";
import { ReloadIcon } from "@radix-ui/react-icons";
import { getImageURL } from "@/routes/tasks/detail/artifactUtils";

// Props accepted by WorkflowRunBlockScreenshot.
type Props = {
// ID of the workflow run block whose artifacts are requested from the API.
workflowRunBlockId: string;
};

/**
 * Shows the first LLM screenshot artifact recorded for a workflow run block.
 *
 * The artifact list is fetched from `/workflow_run_block/{id}/artifacts` and
 * re-fetched every 5 seconds until an LLM screenshot is present. While the
 * first request is in flight a spinner is shown; if no LLM screenshot exists
 * a placeholder message is rendered instead.
 */
function WorkflowRunBlockScreenshot({ workflowRunBlockId }: Props) {
  const credentialGetter = useCredentialGetter();

  // Single predicate shared by the polling check and the render path.
  const isLlmScreenshot = (artifact: ArtifactApiResponse) =>
    artifact.artifact_type === ArtifactType.LLMScreenshot;

  const { data: artifacts, isLoading } = useQuery<Array<ArtifactApiResponse>>({
    queryKey: ["workflowRunBlock", workflowRunBlockId, "artifacts"],
    queryFn: async () => {
      const client = await getClient(credentialGetter);
      const response = await client.get(
        `/workflow_run_block/${workflowRunBlockId}/artifacts`,
      );
      return response.data;
    },
    // Keep polling (5s) until an LLM screenshot shows up, then stop refetching.
    refetchInterval: (query) =>
      query.state.data?.some(isLlmScreenshot) ? false : 5000,
  });

  if (isLoading) {
    return (
      <div className="flex h-full items-center justify-center gap-2 bg-slate-elevation1">
        <ReloadIcon className="h-6 w-6 animate-spin" />
        <div>Loading screenshot...</div>
      </div>
    );
  }

  const screenshot = artifacts?.find(isLlmScreenshot);

  if (!screenshot) {
    return (
      <div className="flex h-full items-center justify-center bg-slate-elevation1">
        No screenshot found for this workflow run block.
      </div>
    );
  }

  return (
    <figure className="mx-auto flex max-w-full flex-col items-center gap-2 overflow-hidden rounded">
      <ZoomableImage src={getImageURL(screenshot)} alt="llm-screenshot" />
    </figure>
  );
}

export { WorkflowRunBlockScreenshot };
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import { DotFilledIcon } from "@radix-ui/react-icons";
import { WorkflowRunTimelineItemInfoSection } from "./WorkflowRunTimelineItemInfoSection";
import { ObserverThoughtScreenshot } from "./ObserverThoughtScreenshot";
import { ScrollArea, ScrollAreaViewport } from "@/components/ui/scroll-area";
import { WorkflowRunBlockScreenshot } from "./WorkflowRunBlockScreenshot";

export type ActionItem = {
block: WorkflowRunBlock;
Expand Down Expand Up @@ -103,9 +104,9 @@ function WorkflowRunOverview() {
/>
)}
{isWorkflowRunBlock(selection) && (
<div className="flex h-full w-full items-center justify-center bg-slate-elevation1">
No screenshot found for this block
</div>
<WorkflowRunBlockScreenshot
workflowRunBlockId={selection.workflow_run_block_id}
/>
)}
{isObserverThought(selection) && (
<ObserverThoughtScreenshot
Expand Down
1 change: 1 addition & 0 deletions skyvern/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ class Settings(BaseSettings):
SVG_MAX_LENGTH: int = 100000

ENABLE_LOG_ARTIFACTS: bool = False
STREAMING_FILE_BASE_PATH: str = "./temp"

def is_cloud_environment(self) -> bool:
"""
Expand Down
3 changes: 2 additions & 1 deletion skyvern/forge/prompts/skyvern/extract-action.j2
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ Reply in JSON format with the following keys:
"user_detail_query": str, // Think of this value as a Jeopardy question. Ask the user for the details you need for executing this action. Ask the question even if the details are disclosed in user goal or user details. If it's a text field, ask for the text. If it's a file upload, ask for the file. If it's a dropdown, ask for the relevant information. If you are clicking on something specific, ask about what to click on. If you're downloading a file and you have multiple options, ask the user which one to download. Otherwise, use null. Examples are: "What product ID should I input into the search bar?", "What file should I upload?", "What is the previous insurance provider of the user?", "Which invoice should I download?", "Does the user have any pets?". If the action doesn't require any user details, use null.
"user_detail_answer": str, // The answer to the `user_detail_query`. The source of this answer can be user goal or user details.
"confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
"action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE". "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the user goal has been achieved AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the user goal is achieved. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.
"action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "GOTO_PAGE", "COMPLETE", "TERMINATE". "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "GOTO_PAGE" is used when you want to go to another url but the current page does not have any link or action that leads to that url. "COMPLETE" is used when the user goal has been achieved AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the user goal is achieved. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.
"id": str, // The id of the element to take action on. The id has to be one from the elements list
"text": str, // Text for INPUT_TEXT action only
"file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise.
"url": str, // The url to go to if applicable. This field must be present for GOTO_PAGE action only. It should be null otherwise.
"download": bool, // Can only be true for CLICK actions. If true, the browser will trigger a download by clicking the element. If false, the browser will click the element without triggering a download.
"option": { // The option to select for SELECT_OPTION action only. null if not SELECT_OPTION action
"label": str, // the label of the option if any. MAKE SURE YOU USE THIS LABEL TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION LABEL HERE
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
The user is trying to achieve a goal on the web. Now they've decided to go through a list of values and perform the same tasks with each variant in the list.

Help to user extract this list of values based on what they want to achieve:
Help the user extract a list of values based on what they want to achieve:
```
{{ plan }}
```
```
22 changes: 22 additions & 0 deletions skyvern/forge/sdk/artifact/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from skyvern.forge.sdk.db.id import generate_artifact_id
from skyvern.forge.sdk.models import Step
from skyvern.forge.sdk.schemas.observers import ObserverCruise, ObserverThought
from skyvern.forge.sdk.schemas.workflow_runs import WorkflowRunBlock

LOG = structlog.get_logger(__name__)

Expand Down Expand Up @@ -151,6 +152,27 @@ async def create_observer_cruise_artifact(
path=path,
)

async def create_workflow_run_block_artifact(
    self,
    workflow_run_block: WorkflowRunBlock,
    artifact_type: ArtifactType,
    data: bytes | None = None,
    path: str | None = None,
) -> str:
    """Create an artifact associated with a workflow run block.

    A fresh artifact id is generated, the configured storage backend builds
    the URI for it, and creation is delegated to ``_create_artifact`` with the
    block's own id, workflow run id and organization id.

    Args:
        workflow_run_block: Block the artifact belongs to.
        artifact_type: Kind of artifact being created.
        data: Optional raw bytes, forwarded unchanged to ``_create_artifact``.
        path: Optional path, forwarded unchanged to ``_create_artifact``.

    Returns:
        The value returned by ``_create_artifact``.
    """
    new_artifact_id = generate_artifact_id()
    artifact_uri = app.STORAGE.build_workflow_run_block_uri(new_artifact_id, workflow_run_block, artifact_type)
    # The block id is used both as the aio task key and as the artifact's block reference.
    block_id = workflow_run_block.workflow_run_block_id
    return await self._create_artifact(
        aio_task_primary_key=block_id,
        artifact_id=new_artifact_id,
        artifact_type=artifact_type,
        uri=artifact_uri,
        workflow_run_block_id=block_id,
        workflow_run_id=workflow_run_block.workflow_run_id,
        organization_id=workflow_run_block.organization_id,
        data=data,
        path=path,
    )

async def create_llm_artifact(
self,
data: bytes,
Expand Down
7 changes: 7 additions & 0 deletions skyvern/forge/sdk/artifact/storage/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from skyvern.forge.sdk.artifact.models import Artifact, ArtifactType, LogEntityType
from skyvern.forge.sdk.models import Step
from skyvern.forge.sdk.schemas.observers import ObserverCruise, ObserverThought
from skyvern.forge.sdk.schemas.workflow_runs import WorkflowRunBlock

# TODO: This should be a part of the ArtifactType model
FILE_EXTENTSION_MAP: dict[ArtifactType, str] = {
Expand Down Expand Up @@ -52,6 +53,12 @@ def build_observer_cruise_uri(
) -> str:
pass

@abstractmethod
def build_workflow_run_block_uri(
    self,
    artifact_id: str,
    workflow_run_block: WorkflowRunBlock,
    artifact_type: ArtifactType,
) -> str:
    """Return the storage URI for an artifact that belongs to a workflow run block.

    Abstract: the base implementation does nothing and concrete storage
    backends provide the actual URI scheme.
    """

@abstractmethod
async def store_artifact(self, artifact: Artifact, data: bytes) -> None:
pass
Expand Down
7 changes: 7 additions & 0 deletions skyvern/forge/sdk/artifact/storage/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from skyvern.forge.sdk.artifact.storage.base import FILE_EXTENTSION_MAP, BaseStorage
from skyvern.forge.sdk.models import Step
from skyvern.forge.sdk.schemas.observers import ObserverCruise, ObserverThought
from skyvern.forge.sdk.schemas.workflow_runs import WorkflowRunBlock

LOG = structlog.get_logger()

Expand Down Expand Up @@ -40,6 +41,12 @@ def build_observer_cruise_uri(
file_ext = FILE_EXTENTSION_MAP[artifact_type]
return f"file://{self.artifact_path}/{settings.ENV}/observers/{observer_cruise.observer_cruise_id}/{datetime.utcnow().isoformat()}_{artifact_id}_{artifact_type}.{file_ext}"

def build_workflow_run_block_uri(
    self, artifact_id: str, workflow_run_block: WorkflowRunBlock, artifact_type: ArtifactType
) -> str:
    """Build the ``file://`` URI for a workflow run block artifact.

    Layout: ``<artifact_path>/<ENV>/workflow_runs/<workflow_run_id>/<workflow_run_block_id>/``
    followed by ``<UTC ISO timestamp>_<artifact_id>_<artifact_type>.<ext>``, where the
    extension is looked up in ``FILE_EXTENTSION_MAP``.
    """
    extension = FILE_EXTENTSION_MAP[artifact_type]
    block_dir = (
        f"{self.artifact_path}/{settings.ENV}/workflow_runs"
        f"/{workflow_run_block.workflow_run_id}/{workflow_run_block.workflow_run_block_id}"
    )
    file_name = f"{datetime.utcnow().isoformat()}_{artifact_id}_{artifact_type}.{extension}"
    return f"file://{block_dir}/{file_name}"

async def store_artifact(self, artifact: Artifact, data: bytes) -> None:
file_path = None
try:
Expand Down
7 changes: 7 additions & 0 deletions skyvern/forge/sdk/artifact/storage/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from skyvern.forge.sdk.artifact.storage.base import FILE_EXTENTSION_MAP, BaseStorage
from skyvern.forge.sdk.models import Step
from skyvern.forge.sdk.schemas.observers import ObserverCruise, ObserverThought
from skyvern.forge.sdk.schemas.workflow_runs import WorkflowRunBlock


class S3Storage(BaseStorage):
Expand Down Expand Up @@ -43,6 +44,12 @@ def build_observer_cruise_uri(
file_ext = FILE_EXTENTSION_MAP[artifact_type]
return f"s3://{self.bucket}/{settings.ENV}/observers/{observer_cruise.observer_cruise_id}/{datetime.utcnow().isoformat()}_{artifact_id}_{artifact_type}.{file_ext}"

def build_workflow_run_block_uri(
    self, artifact_id: str, workflow_run_block: WorkflowRunBlock, artifact_type: ArtifactType
) -> str:
    """Build the ``s3://`` URI for a workflow run block artifact.

    Layout: ``<bucket>/<ENV>/workflow_runs/<workflow_run_id>/<workflow_run_block_id>/``
    followed by ``<UTC ISO timestamp>_<artifact_id>_<artifact_type>.<ext>``, where the
    extension is looked up in ``FILE_EXTENTSION_MAP``.
    """
    extension = FILE_EXTENTSION_MAP[artifact_type]
    key_prefix = (
        f"{settings.ENV}/workflow_runs"
        f"/{workflow_run_block.workflow_run_id}/{workflow_run_block.workflow_run_block_id}"
    )
    object_name = f"{datetime.utcnow().isoformat()}_{artifact_id}_{artifact_type}.{extension}"
    return f"s3://{self.bucket}/{key_prefix}/{object_name}"

async def store_artifact(self, artifact: Artifact, data: bytes) -> None:
await self.async_client.upload_file(artifact.uri, data)

Expand Down
Loading

0 comments on commit d3414c9

Please sign in to comment.