[[ feature ]] option for incremental storage of HPGs (#18)
SoheilKhodayari committed Dec 17, 2023
1 parent 3325d69 commit 01ca692
Showing 14 changed files with 417 additions and 69 deletions.
README.md: 1 change (1 addition, 0 deletions)
@@ -341,6 +341,7 @@ $ screen -dmS s2 bash -c 'python3 -m run_pipeline --conf=conf2.yaml; exec sh'
$ # [...]
```

To generate parallel configuration files automatically, you may use the [`generate_config.py`](https://github.com/SoheilKhodayari/JAW/blob/master/input/generate_config.py) script.


#### How to Interpret the Output of the Analysis?
analyses/cs_csrf/static_analysis.js: 30 changes (20 additions, 10 deletions)
@@ -54,10 +54,10 @@ const dataStorageDirectory = pathModule.join(BASE_DIR, 'data');
// when true, nodejs will log the current step for each webpage to the console
const DEBUG = true;




var libraryHeuristics = []
const do_ast_preprocessing_passes = false;
var do_compress_graphs = true;
var overwrite_hpg = false;
var iterative_output = false;


/**
@@ -66,6 +66,7 @@ var libraryHeuristics = []
* ------------------------------------------------
**/

var libraryHeuristics = []

function isLibraryScript(scriptContent){
let flag = false;
@@ -152,7 +153,7 @@ function getOrCreateDataDirectoryForWebsite(url){
async function staticallyAnalyzeWebpage(url, webpageFolder){

let results_timing_file = pathModule.join(webpageFolder, "time.static_analysis.out");
if(fs.existsSync(results_timing_file)){
if(!overwrite_hpg && fs.existsSync(results_timing_file)){
DEBUG && console.log('[skipping] results already exists for: '+ webpageFolder)
return 1;
}
@@ -196,7 +197,7 @@ async function staticallyAnalyzeWebpage(url, webpageFolder){
let parsingErrors = [];
for(let [idx, script] of scripts.entries()){
let scriptName = script.name; // '' + idx + '.js';
let parsingError = await SourceSinkAnalyzerInstance.api.initializeModelsFromSource(scriptName, script.source, constantsModule.LANG.js, true)
let parsingError = await SourceSinkAnalyzerInstance.api.initializeModelsFromSource(scriptName, script.source, constantsModule.LANG.js, do_ast_preprocessing_passes)
if(parsingError && parsingError === scriptName){
parsingErrors.push(parsingError);
}
@@ -214,13 +215,19 @@ async function staticallyAnalyzeWebpage(url, webpageFolder){

const CsvHpgConstructionTimer = elapsed.start('csv_hpg_construction_timer');
DEBUG && console.log('[StaticAnalysis] started HPG export: IPCG/ERDDG/SemTypes.')
const graph = await SourceSinkAnalyzerInstance.api.buildHPG({ 'ipcg': true, 'erddg': true }); // IPCG, ERDDG + SemanticTypes + node/edge format
var graphBuilderOptions= { 'ipcg': true, 'erddg': true, 'output': webpageFolder, 'iterativeOutput': iterative_output };
const graph = await SourceSinkAnalyzerInstance.api.buildHPG(graphBuilderOptions); // IPCG, ERDDG + SemanticTypes + node/edge format
const graphid = hashURL(url);
GraphExporter.exportToCSV(graph, graphid, webpageFolder);
DEBUG && console.log('[StaticAnalysis] finished HPG export: IPCG/ERDDG/SemTypes.')
DEBUG && console.log('[StaticAnalysis] started compressing HPG.')
GraphExporter.compressGraph(webpageFolder);
DEBUG && console.log('[StaticAnalysis] finished compressing HPG.')


if(do_compress_graphs){
DEBUG && console.log('[StaticAnalysis] started compressing HPG.');
GraphExporter.compressGraph(webpageFolder);
DEBUG && console.log('[StaticAnalysis] finished compressing HPG.');
}

const CsvHpgConstructionTime = CsvHpgConstructionTimer.get();
CsvHpgConstructionTimer.end();

@@ -255,6 +262,9 @@ async function staticallyAnalyzeWebpage(url, webpageFolder){
const seedurl = config.seedurl;
const singleFolder = config.singlefolder;

overwrite_hpg = (config.overwritehpg && config.overwritehpg.toLowerCase() === 'true')? true: false;
do_compress_graphs = (config.compresshpg && config.compresshpg.toLowerCase() === 'false')? false: true;
iterative_output = (config.iterativeoutput && config.iterativeoutput.toLowerCase() === 'true')? true: false;

if(singleFolder && singleFolder.length > 10){

analyses/cs_csrf/static_analysis_api.py: 112 changes (112 additions, 0 deletions)
@@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-

"""
Copyright (C) 2022 Soheil Khodayari, CISPA
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Description:
------------
API for running the client-side CSRF preliminary analyses (i.e., property graph construction)
Usage:
------------
start_model_construction(website_url, iterative_output='false', memory=None, timeout=None, compress_hpg='true', overwrite_hpg='false', specific_webpage=None)
"""



import os, sys, json
import utils.io as IOModule
import constants as constantsModule
import utils.utility as utilityModule
from utils.logging import logger as LOGGER



def start_model_construction(website_url, iterative_output='false', memory=None, timeout=None, compress_hpg='true', overwrite_hpg='false', specific_webpage=None):

# setup defaults
if memory is None:
static_analysis_memory = '32000'
else:
static_analysis_memory = memory

if timeout is None:
static_analysis_per_webpage_timeout = 600 # seconds
else:
static_analysis_per_webpage_timeout = timeout


cs_csrf_analyses_command_cwd = os.path.join(constantsModule.BASE_DIR, "analyses/cs_csrf")
cs_csrf_static_analysis_driver_program = os.path.join(cs_csrf_analyses_command_cwd, "static_analysis.js")

cs_csrf_static_analysis_command = "node --max-old-space-size=%s DRIVER_ENTRY --singlefolder=SINGLE_FOLDER --compresshpg=%s --overwritehpg=%s --iterativeoutput=%s"%(static_analysis_memory, compress_hpg, overwrite_hpg, iterative_output)
cs_csrf_static_analysis_command = cs_csrf_static_analysis_command.replace("DRIVER_ENTRY", cs_csrf_static_analysis_driver_program)


website_folder_name = utilityModule.getDirectoryNameFromURL(website_url)
website_folder = os.path.join(constantsModule.DATA_DIR, website_folder_name)

webpages_json_file = os.path.join(website_folder, 'webpages.json')
urls_file = os.path.join(website_folder, 'urls.out')


if specific_webpage is not None:
webpage_folder = os.path.join(constantsModule.DATA_DIR, specific_webpage)
if os.path.exists(webpage_folder):
node_command= cs_csrf_static_analysis_command.replace('SINGLE_FOLDER', webpage_folder)
IOModule.run_os_command(node_command, cwd=cs_csrf_analyses_command_cwd, timeout=static_analysis_per_webpage_timeout, print_stdout=True, log_command=True)

elif os.path.exists(webpages_json_file):

fd = open(webpages_json_file, 'r')
webpages = json.load(fd)
fd.close()

for webpage in webpages:
webpage_folder = os.path.join(website_folder, webpage)
if os.path.exists(webpage_folder):

node_command= cs_csrf_static_analysis_command.replace('SINGLE_FOLDER', webpage_folder)
IOModule.run_os_command(node_command, cwd=cs_csrf_analyses_command_cwd, timeout=static_analysis_per_webpage_timeout, print_stdout=True, log_command=True)



elif os.path.exists(urls_file):
message = 'webpages.json file does not exist, falling back to urls.out'
LOGGER.warning(message)

# read the urls from the webpage data
fd = open(urls_file, 'r')
urls = fd.readlines()
fd.close()

# make sure that the list of urls is unique
# this would eliminate the cases where the crawler is executed multiple times for the same site
# without deleting the data of the old crawl and thus adds duplicate urls to urls.out file.
urls = list(set(urls))

for url in urls:
url = url.strip().rstrip('\n').strip()
webpage_folder_name = utilityModule.sha256(url)
webpage_folder = os.path.join(website_folder, webpage_folder_name)
if os.path.exists(webpage_folder):
node_command= cs_csrf_static_analysis_command.replace('SINGLE_FOLDER', webpage_folder)
IOModule.run_os_command(node_command, cwd=cs_csrf_analyses_command_cwd, timeout=static_analysis_per_webpage_timeout, print_stdout=True, log_command=True)

else:
message = 'no webpages.json or urls.out file exists in the webapp directory; skipping analysis...'
LOGGER.warning(message)
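
For reference, the sketch below shows one way the new `start_model_construction` API could be invoked with the incremental-storage options added in this commit. The import path, example URL, and option values are assumptions for illustration and are not part of the diff; internally, the function shells out to `static_analysis.js` with the corresponding `--compresshpg`, `--overwritehpg`, and `--iterativeoutput` flags.

```python
# Minimal, hypothetical driver (assumed to run from the repository root so
# that `analyses` and its dependencies are importable).
from analyses.cs_csrf import static_analysis_api

if __name__ == "__main__":
    static_analysis_api.start_model_construction(
        "https://example.com",      # placeholder website URL
        iterative_output='true',    # store the HPG incrementally while it is built
        memory='16000',             # passed to node as --max-old-space-size (MB)
        timeout=900,                # per-webpage static analysis timeout (seconds)
        compress_hpg='false',       # skip compressing the exported graph files
        overwrite_hpg='true',       # re-analyze pages even if time.static_analysis.out exists
        specific_webpage=None,      # None: analyze all webpages stored for this site
    )
```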

analyses/domclobbering/static_analysis.js: 30 changes (23 additions, 7 deletions)
@@ -40,6 +40,7 @@ const DOMClobberingSourceSinkAnalyzer = DOMClobberingSourceSinkAnalyzerModule.DO
const DOMClobberingPayloadGeneratorModule = require('./domc_payload_generator.js');
const DOMClobberingPayloadGenerator = DOMClobberingPayloadGeneratorModule.DOMClobberingPayloadGenerator;

const constantsModule = require('./../../engine/lib/jaw/constants');
const GraphExporter = require('./../../engine/core/io/graphexporter');

/**
@@ -55,8 +56,11 @@ const dataStorageDirectory = pathModule.join(BASE_DIR, 'data');
// when true, nodejs will log the current step for each webpage to the console
const DEBUG = true;

const do_ast_preprocessing_passes = false;
var do_compress_graphs = true;
var overwrite_hpg = false;
var iterative_output = false;

var libraryHeuristics = []


/**
@@ -65,6 +69,7 @@ var libraryHeuristics = []
* ------------------------------------------------
**/

var libraryHeuristics = []

function isLibraryScript(scriptContent){
let flag = false;
@@ -151,7 +156,7 @@ function getOrCreateDataDirectoryForWebsite(url){
async function staticallyAnalyzeWebpage(url, webpageFolder){

let results_timing_file = pathModule.join(webpageFolder, "time.static_analysis.out");
if(fs.existsSync(results_timing_file)){
if(!overwrite_hpg && fs.existsSync(results_timing_file)){
DEBUG && console.log('[skipping] results already exists for: '+ webpageFolder)
return 1;
}
@@ -196,7 +201,7 @@ async function staticallyAnalyzeWebpage(url, webpageFolder){
let parsingErrors = [];
for(let [idx, script] of scripts.entries()){
let scriptName = script.name; // '' + idx + '.js';
let parsingError = await domcSourceSinkAnalyzer.domcModelBuilder.initializeModelsFromSource(scriptName, script.source)
let parsingError = await domcSourceSinkAnalyzer.domcModelBuilder.initializeModelsFromSource(scriptName, script.source, constantsModule.LANG.js, do_ast_preprocessing_passes);
if(parsingError && parsingError === scriptName){
parsingErrors.push(parsingError);
}
@@ -273,13 +278,20 @@ async function staticallyAnalyzeWebpage(url, webpageFolder){

const CsvHpgConstructionTimer = elapsed.start('csv_hpg_construction_timer');
DEBUG && console.log('[StaticAnalysis] started HPG export: IPCG/ERDDG/SemTypes.')
const graph = await domcSourceSinkAnalyzer.domcModelBuilder.buildHPG({ 'ipcg': true, 'erddg': true }); // IPCG, ERDDG + SemanticTypes + node/edge format

var graphBuilderOptions= { 'ipcg': true, 'erddg': true, 'output': webpageFolder, 'iterativeOutput': iterative_output };
const graph = await domcSourceSinkAnalyzer.domcModelBuilder.buildHPG(graphBuilderOptions); // IPCG, ERDDG + SemanticTypes + node/edge format
const graphid = hashURL(url);
GraphExporter.exportToCSV(graph, graphid, webpageFolder);
DEBUG && console.log('[StaticAnalysis] finished HPG export: IPCG/ERDDG/SemTypes.')
DEBUG && console.log('[StaticAnalysis] started compressing HPG.')
GraphExporter.compressGraph(webpageFolder);
DEBUG && console.log('[StaticAnalysis] finished compressing HPG.')


if(do_compress_graphs){
DEBUG && console.log('[StaticAnalysis] started compressing HPG.');
GraphExporter.compressGraph(webpageFolder);
DEBUG && console.log('[StaticAnalysis] finished compressing HPG.');
}

const CsvHpgConstructionTime = CsvHpgConstructionTimer.get();
CsvHpgConstructionTimer.end();

@@ -316,6 +328,10 @@ async function staticallyAnalyzeWebpage(url, webpageFolder){
const seedurl = config.seedurl;
const singleFolder = config.singlefolder;

overwrite_hpg = (config.overwritehpg && config.overwritehpg.toLowerCase() === 'true')? true: false;
do_compress_graphs = (config.compresshpg && config.compresshpg.toLowerCase() === 'false')? false: true;
iterative_output = (config.iterativeoutput && config.iterativeoutput.toLowerCase() === 'true')? true: false;


if(singleFolder && singleFolder.length > 10){

analyses/domclobbering/static_analysis_api.py: 112 changes (112 additions, 0 deletions)
@@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-

"""
Copyright (C) 2022 Soheil Khodayari, CISPA
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Description:
------------
API for running the DOM Clobbering preliminary analyses (i.e., property graph construction and identifying sinks)
Usage:
------------
start_model_construction(website_url, iterative_output='false', memory=None, timeout=None, compress_hpg='true', overwrite_hpg='false', specific_webpage=None)
"""



import os, sys, json
import utils.io as IOModule
import constants as constantsModule
import utils.utility as utilityModule
from utils.logging import logger as LOGGER



def start_model_construction(website_url, iterative_output='false', memory=None, timeout=None, compress_hpg='true', overwrite_hpg='false', specific_webpage=None):

# setup defaults
if memory is None:
static_analysis_memory = '32000'
else:
static_analysis_memory = memory

if timeout is None:
static_analysis_per_webpage_timeout = 600 # seconds
else:
static_analysis_per_webpage_timeout = timeout


domclobbering_analyses_command_cwd = os.path.join(constantsModule.BASE_DIR, "analyses/domclobbering")
domclobbering_static_analysis_driver_program = os.path.join(domclobbering_analyses_command_cwd, "static_analysis.js")

domclobbering_static_analysis_command = "node --max-old-space-size=%s DRIVER_ENTRY --singlefolder=SINGLE_FOLDER --compresshpg=%s --overwritehpg=%s --iterativeoutput=%s"%(static_analysis_memory, compress_hpg, overwrite_hpg, iterative_output)
domclobbering_static_analysis_command = domclobbering_static_analysis_command.replace("DRIVER_ENTRY", domclobbering_static_analysis_driver_program)


website_folder_name = utilityModule.getDirectoryNameFromURL(website_url)
website_folder = os.path.join(constantsModule.DATA_DIR, website_folder_name)

webpages_json_file = os.path.join(website_folder, 'webpages.json')
urls_file = os.path.join(website_folder, 'urls.out')


if specific_webpage is not None:
webpage_folder = os.path.join(constantsModule.DATA_DIR, specific_webpage)
if os.path.exists(webpage_folder):
node_command= domclobbering_static_analysis_command.replace('SINGLE_FOLDER', webpage_folder)
IOModule.run_os_command(node_command, cwd=domclobbering_analyses_command_cwd, timeout=static_analysis_per_webpage_timeout, print_stdout=True, log_command=True)

elif os.path.exists(webpages_json_file):

fd = open(webpages_json_file, 'r')
webpages = json.load(fd)
fd.close()

for webpage in webpages:
webpage_folder = os.path.join(website_folder, webpage)
if os.path.exists(webpage_folder):

node_command= domclobbering_static_analysis_command.replace('SINGLE_FOLDER', webpage_folder)
IOModule.run_os_command(node_command, cwd=domclobbering_analyses_command_cwd, timeout=static_analysis_per_webpage_timeout, print_stdout=True, log_command=True)



elif os.path.exists(urls_file):
message = 'webpages.json file does not exist, falling back to urls.out'
LOGGER.warning(message)

# read the urls from the webpage data
fd = open(urls_file, 'r')
urls = fd.readlines()
fd.close()

# make sure that the list of urls is unique
# this would eliminate the cases where the crawler is executed multiple times for the same site
# without deleting the data of the old crawl and thus adds duplicate urls to urls.out file.
urls = list(set(urls))

for url in urls:
url = url.strip().rstrip('\n').strip()
webpage_folder_name = utilityModule.sha256(url)
webpage_folder = os.path.join(website_folder, webpage_folder_name)
if os.path.exists(webpage_folder):
node_command= domclobbering_static_analysis_command.replace('SINGLE_FOLDER', webpage_folder)
IOModule.run_os_command(node_command, cwd=domclobbering_analyses_command_cwd, timeout=static_analysis_per_webpage_timeout, print_stdout=True, log_command=True)

else:
message = 'no webpages.json or urls.out file exists in the webapp directory; skipping analysis...'
LOGGER.warning(message)

(The remaining 9 changed files are not shown here.)
