Skip to content

Commit

Permalink
Added support to scan google drive and google workplace drive
Browse files Browse the repository at this point in the history
  • Loading branch information
rohitcoder committed Dec 6, 2023
1 parent 40abbd0 commit 694368c
Show file tree
Hide file tree
Showing 6 changed files with 211 additions and 5 deletions.
14 changes: 13 additions & 1 deletion connection.yml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,19 @@ sources:
gdrive:
drive_example:
folder_name:
credentials_file: /Users/kumarohit/Downloads/client_secret.json
credentials_file: /Users/kumarohit/Downloads/client_secret.json ## this will be oauth app json file
cache: true
exclude_patterns:
- .pdf
- .docx

gdrive_workspace:
drive_example:
folder_name:
credentials_file: /Users/kumarohit/Downloads/client_secret.json ## this will be service account json file
impersonate_users:
- usera@amce.org
- userb@amce.org
cache: true
exclude_patterns:
- .pdf
Expand Down
5 changes: 5 additions & 0 deletions hawk_scanner/commands/gdrive.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
def connect_google_drive(credentials_file):
credentials = open(credentials_file, 'r').read()
credentials = json.loads(credentials)
## if installed key is in the credentials file, use it
if 'installed' in credentials:
credentials = credentials['installed']
client_id = credentials['client_id']
client_secret = credentials['client_secret']

Expand Down Expand Up @@ -82,6 +85,8 @@ def execute(args):
exclude_patterns = config.get(key, {}).get('exclude_patterns', [])
is_cache_enabled = config.get('cache', False)
drive = connect_google_drive(credentials_file)
if not os.path.exists("data/google_drive"):
os.makedirs("data/google_drive")
if drive:
files = list_files(drive, folder_name=folder_name)
for file_obj in files:
Expand Down
145 changes: 145 additions & 0 deletions hawk_scanner/commands/gdrive_workspace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import os
import json
from google.oauth2 import service_account
from googleapiclient.discovery import build
from hawk_scanner.internals import system

def connect_google_drive(credentials_file, impersonate_user=None):
credentials_json = open(credentials_file, 'r').read()
credentials_json = json.loads(credentials_json)
credentials = service_account.Credentials.from_service_account_file(
credentials_file,
scopes=['https://www.googleapis.com/auth/drive.readonly'],
)

if impersonate_user:
delegated_credentials = credentials.with_subject(impersonate_user)
credentials = delegated_credentials

try:
drive_service = build('drive', 'v3', credentials=credentials)
return drive_service
except Exception as e:
print(f"Failed to connect to Google Drive: {e}")

def download_file(drive, file_obj, base_path):
try:
file_name = file_obj['name']
file_id = file_obj['id']

folder_path = base_path
if 'parents' in file_obj:
for parent_id in file_obj['parents']:
parent_folder = drive.files().get(fileId=parent_id).execute()
if parent_folder['name'] == 'My Drive':
continue
folder_path = os.path.join(folder_path, parent_folder['name'])

file_path = os.path.join(folder_path, file_name)

if 'mimeType' in file_obj and file_obj['mimeType'] == 'application/vnd.google-apps.folder':
if not os.path.exists(file_path):
os.makedirs(file_path)
folder_files = drive.files().list(q=f"'{file_id}' in parents").execute().get('files', [])
for folder_file in folder_files:
download_file(drive, folder_file, folder_path)
else:
download_url = drive.files().get_media(fileId=file_id).execute()
with open(file_path, 'wb') as fh:
fh.write(download_url)

system.print_debug(f"File downloaded to: {file_path}")
except Exception as e:
print(f"Failed to download file: {e}")

def list_files(drive, impersonate_user=None):
try:
query = "'root' in parents"
if impersonate_user:
query += f" and '{impersonate_user}' in owners"
file_list = drive.files().list(q=query).execute().get('files', [])
return file_list
except Exception as e:
print(f"Error listing files: {e}")
return []

def execute(args):
results = []
connections = system.get_connection()
is_cache_enabled = False

if 'sources' in connections:
sources_config = connections['sources']
drive_config = sources_config.get('gdrive_workspace')
else:
system.print_error("No 'sources' section found in connection.yml")

if drive_config:
for key, config in drive_config.items():
credentials_file = config.get('credentials_file')
impersonate_users = config.get('impersonate_users', [])
exclude_patterns = config.get(key, {}).get('exclude_patterns', [])
is_cache_enabled = config.get('cache', False)

for impersonate_user in impersonate_users or [None]:
drive = connect_google_drive(credentials_file, impersonate_user)
if not os.path.exists("data/google_drive"):
os.makedirs("data/google_drive")
if drive:
files = list_files(drive, impersonate_user)
for file_obj in files:
download_file(drive, file_obj, "data/google_drive")
file_id = file_obj['id']
file_name = file_obj['name']
if 'mimeType' in file_obj and file_obj['mimeType'] == 'application/vnd.google-apps.folder':
continue

parent_folder_ids = file_obj.get('parents', [])
folder_path = "data/google_drive"
if parent_folder_ids:
for parent_id in parent_folder_ids:
parent_folder = drive.files().get(fileId=parent_id).execute()
if parent_folder['name'] == 'My Drive':
continue
folder_path = os.path.join(folder_path, parent_folder['name'])

file_path = os.path.join(folder_path, file_name)

if system.should_exclude_file(file_name, exclude_patterns):
continue

if config.get("cache") and os.path.exists(file_path):
is_cache_enabled = False
system.print_debug(f"File already exists in cache, using it.")
else:
is_cache_enabled = True

if is_cache_enabled:
download_file(drive, file_obj, "data/google_drive")

matches = system.read_match_strings(file_path, 'gdrive')
if matches:
for match in matches:
results.append({
'file_id': file_id,
'file_name': file_name,
'user': impersonate_user,
'file_path': file_path,
'pattern_name': match['pattern_name'],
'matches': match['matches'],
'sample_text': match['sample_text'],
'profile': key,
'data_source': 'gdrive_workspace'
})
else:
system.print_error("Failed to connect to Google Drive")
else:
system.print_error("No Google Drive connection details found in connection file")

if not is_cache_enabled:
os.system("rm -rf data/google_drive")

return results

# Call the execute function with the necessary arguments
# execute(y
34 changes: 33 additions & 1 deletion hawk_scanner/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def clear_screen():
console = Console()

## Now separate the results by data_source
data_sources = ['s3', 'mysql', 'redis', 'firebase', 'gcs', 'fs', 'postgresql', 'mongodb', 'slack', 'couchdb', 'gdrive']
data_sources = ['s3', 'mysql', 'redis', 'firebase', 'gcs', 'fs', 'postgresql', 'mongodb', 'slack', 'couchdb', 'gdrive', 'gdrive_workspace']

def load_command_module(command):
try:
Expand Down Expand Up @@ -101,6 +101,9 @@ def main():
table.add_column("Host > Database > Document ID > Field")
elif group == 'gdrive':
table.add_column("File Name")
elif group == 'gdrive_workspace':
table.add_column("File Name")
table.add_column("User")

table.add_column("Pattern Name")
table.add_column("Total Exposed")
Expand Down Expand Up @@ -418,6 +421,35 @@ def main():
exposed_values=records_mini
)

system.SlackNotify(AlertMsg)
elif group == 'gdrive_workspace':
table.add_row(
str(i),
result['profile'],
f"{result['file_name']}",
result['user'],
result['pattern_name'],
str(len(result['matches'])),
records_mini,
result['sample_text'],
)
AlertMsg = """
*** PII Or Secret Found ***
Data Source: Google Drive Workspace - {vulnerable_profile}
File Name: {file_name}
User: {user}
Pattern Name: {pattern_name}
Total Exposed: {total_exposed}
Exposed Values: {exposed_values}
""".format(
vulnerable_profile=result['profile'],
file_name=result['file_name'],
user=result['user'],
pattern_name=result['pattern_name'],
total_exposed=str(len(result['matches'])),
exposed_values=records_mini
)

system.SlackNotify(AlertMsg)
else:
# Handle other cases or do nothing for unsupported groups
Expand Down
16 changes: 14 additions & 2 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -274,10 +274,22 @@ sources:
- venv
- node_modules

gdrive:
gdrive:
drive_example:
folder_name:
credentials_file: /Users/kumarohit/Downloads/client_secret.json
credentials_file: /Users/kumarohit/Downloads/client_secret.json ## this will be oauth app json file
cache: true
exclude_patterns:
- .pdf
- .docx

gdrive_workspace:
drive_example:
folder_name:
credentials_file: /Users/kumarohit/Downloads/client_secret.json ## this will be service account json file
impersonate_users:
- usera@amce.org
- userb@amce.org
cache: true
exclude_patterns:
- .pdf
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
VERSION = "0.3.5"
VERSION = "0.3.6"

from setuptools import setup, find_packages

Expand Down

0 comments on commit 694368c

Please sign in to comment.