import base64
import os
import subprocess
import sys

import pyautogui
import requests

# Path where the temporary (downscaled) screenshot is saved.
TEMP_SCREENSHOT_PATH = "temp.png"


def encode_image(image_path):
    """Encode the image at the given path as a base64 string.

    Parameters:
    - image_path (str): The path to the image file.

    Returns:
    - encoded_image (str): The base64-encoded image string.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def take_screenshot(image_path=TEMP_SCREENSHOT_PATH):
    """Take a screenshot of the screen and save it as a downscaled image.

    Args:
        image_path (str, optional): The path where the downscaled screenshot
            will be saved. Defaults to TEMP_SCREENSHOT_PATH.

    Returns:
        tuple: A tuple containing the downscaled image and the size of the
            original screenshot.
    """
    screenshot = pyautogui.screenshot()
    # Downscale by 4x to keep the uploaded image small.
    scale = 4
    downsampled_image = screenshot.resize(
        (screenshot.width // scale, screenshot.height // scale))
    print(f"downsampled image size: {downsampled_image.size}")
    screen_size = screenshot.size
    print(f"original screen size: {screen_size}")
    # Save the downscaled screenshot to image_path ("temp.png" by default).
    downsampled_image.save(image_path)
    return downsampled_image, screen_size


def is_retina():
    """Check if the screen is a Retina display (macOS only)."""
    if sys.platform == "darwin":
        return subprocess.call(
            "system_profiler SPDisplaysDataType | grep 'Retina'",
            shell=True) == 0
    return False


def crop_image(image, xmin, ymin, xmax, ymax):
    """Crop an image based on given bounding-box coordinates.

    Args:
        image (PIL.Image.Image): The input image to be cropped.
        xmin (float): The normalized minimum x-coordinate of the bounding box.
        ymin (float): The normalized minimum y-coordinate of the bounding box.
        xmax (float): The normalized maximum x-coordinate of the bounding box.
        ymax (float): The normalized maximum y-coordinate of the bounding box.

    Returns:
        PIL.Image.Image: The cropped image.

    Note: The coordinates should be normalized between 0 and 1, where (0, 0)
    is the top-left corner of the image and (1, 1) is the bottom-right corner.
    """
    # Convert normalized coordinates to pixel coordinates.
    width, height = image.size
    xmin_pixel = int(xmin * width)
    ymin_pixel = int(ymin * height)
    xmax_pixel = int(xmax * width)
    ymax_pixel = int(ymax * height)
    # Crop the image.
    return image.crop((xmin_pixel, ymin_pixel, xmax_pixel, ymax_pixel))


def move_to_block(x, y, xmin, ymin, xmax, ymax):
    """Move the mouse cursor to a location on the screen and shrink the search area.

    Parameters:
        x (float): The x-coordinate of the target, normalized within [xmin, xmax].
        y (float): The y-coordinate of the target, normalized within [ymin, ymax].
        xmin (float): The minimum x-value of the bounding box.
        ymin (float): The minimum y-value of the bounding box.
        xmax (float): The maximum x-value of the bounding box.
        ymax (float): The maximum y-value of the bounding box.

    Returns:
        (float, float, float, float): Normalized cropping coordinates
        (xmin, ymin, xmax, ymax) selecting the quadrant of the bounding box
        that contains the target.

    Example:
        crop_xmin, crop_ymin, crop_xmax, crop_ymax = move_to_block(0.3, 0.8, 0, 0, 1, 1)
        # The mouse cursor moves to the (0.3, 0.8) location on the screen.
        # The returned cropping coordinates cover 1/4 of the (0, 0, 1, 1) area.
    """
    # Map the normalized target onto the current bounding box.
    x = xmin + (xmax - xmin) * x
    y = ymin + (ymax - ymin) * y
    # Keep only the quadrant of the bounding box that contains the target.
    xcenter = (xmin + xmax) / 2.0
    ycenter = (ymin + ymax) / 2.0
    crop_xmin, crop_ymin, crop_xmax, crop_ymax = 0, 0, 1, 1
    if x < xcenter:
        crop_xmax = 0.5
    else:
        crop_xmin = 0.5
    if y < ycenter:
        crop_ymax = 0.5
    else:
        crop_ymin = 0.5
    print(f"moving mouse to ({x}, {y})")
    pyautogui.moveTo(x, y, 1, pyautogui.easeOutQuad)
    return crop_xmin, crop_ymin, crop_xmax, crop_ymax


def ask(concept: str, api_key: str):
    """Find a concept on the screen and move the mouse to click it.

    Takes a concept as input and performs sequential localization on a
    screenshot to determine the location of the concept on the screen.

    Parameters:
        concept (str): The concept to be localized on the screen.
        api_key (str): The OpenAI API key used for the vision queries.
    """
    image_path = TEMP_SCREENSHOT_PATH
    screen, screen_size = take_screenshot(image_path=image_path)
    width, height = screen_size
    if is_retina():
        # Retina screenshots have twice the pixel density of the coordinate
        # space pyautogui moves the cursor in.
        width /= 2
        height /= 2
    screen_xmin = 0
    screen_ymin = 0
    screen_xmax = width
    screen_ymax = height
    located = False
    for _ in range(3):
        # Sequential localization: ask for a bounding box, move the mouse to
        # its center, then zoom into the quadrant that contains it.
        query = (f"Where is `{concept}`? Share the x_min, y_min, x_max, y_max "
                 "in 0-1 normalized space. Only return the numbers, nothing else.")
        response = ask_gpt(query, api_key, image_path=image_path)
        if 'choices' not in response:
            # The API returned an error; stop and surface it.
            return response
        content = response['choices'][0]['message']['content']
        try:
            xmin, ymin, xmax, ymax = tuple(map(float, content.split(',')))
        except ValueError:
            print(f"Failed to parse bounding box: {content}")
            continue
        located = True
        x = (xmin + xmax) / 2.0
        y = (ymin + ymax) / 2.0
        crop_xmin, crop_ymin, crop_xmax, crop_ymax = move_to_block(
            x, y, screen_xmin, screen_ymin, screen_xmax, screen_ymax)
        # Refine the search region to the chosen quadrant.
        screen = crop_image(screen, crop_xmin, crop_ymin, crop_xmax, crop_ymax)
        screen.save(image_path)
        screen_xmin, screen_xmax = (
            screen_xmin + crop_xmin * (screen_xmax - screen_xmin),
            screen_xmin + crop_xmax * (screen_xmax - screen_xmin))
        screen_ymin, screen_ymax = (
            screen_ymin + crop_ymin * (screen_ymax - screen_ymin),
            screen_ymin + crop_ymax * (screen_ymax - screen_ymin))
    if located:
        pyautogui.click()
        return f"Clicked ({x}, {y})"
    return content


def ask_gpt(query: str, api_key: str, image_path=TEMP_SCREENSHOT_PATH):
    """Use the GPT-4 Vision API to ask a question about an image.

    Parameters:
        query (str): The question to ask about the image.
        api_key (str): The OpenAI API key.
        image_path (str, optional): The path to the image file to be analyzed.
            Defaults to TEMP_SCREENSHOT_PATH.

    Returns:
        dict: The parsed JSON response from the GPT-4 Vision API.

    Examples:
        >>> ask_gpt("What is this object?", "{your_openai_api_key}", "image.png")
    """
    # Encode the image as a base64 string.
    base64_image = encode_image(image_path)
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": query},
                {
                    "type": "image_url",
                    "image_url": {
                        # The screenshot is saved as PNG, so declare it as such.
                        "url": f"data:image/png;base64,{base64_image}"
                    }
                }
            ]
        }],
        "max_tokens": 300
    }
    response = requests.post("https://api.openai.com/v1/chat/completions",
                             headers=headers,
                             json=payload)
    try:
        return response.json()
    except ValueError:
        # Non-JSON response (e.g., a gateway error page). Surface it as an
        # error dict so callers detect the missing 'choices' key.
        return {"error": {"message": response.text}}
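

# Minimal usage sketch (an illustrative assumption, not part of the original
# module): run `python vision.py "Submit button"` with OPENAI_API_KEY set in
# the environment to locate and click the named concept on screen.
if __name__ == "__main__":
    api_key = os.environ.get("OPENAI_API_KEY")  # assumed environment variable
    if not api_key or len(sys.argv) < 2:
        print("usage: OPENAI_API_KEY=... python vision.py <concept>")
        sys.exit(1)
    print(ask(sys.argv[1], api_key))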