# Cybersecurity_Tools/PDF_Malware_Detection/predict.py

import subprocess
import json
import joblib
import os
# from pdf_extraction import extract_pdf_features

from pdfid import pdfid
import fitz
from os.path import exists
import sys

def bytes_to_kb(bytes):
    """Return a size given in bytes expressed in kilobytes (1 KB = 1024 bytes)."""
    bytes_per_kb = 1024
    return bytes / bytes_per_kb

# Raw pdfid report keys mapped to the column names used by the dataset.
_PDFID_FEATURE_NAMES = {
    'header': 'Header',
    'obj': 'Obj',
    'endobj': 'Endobj',
    'stream': 'Stream',
    'endstream': 'Endstream',
    'xref': 'Xref',
    'trailer': 'Trailer',
    'startxref': 'StartXref',
    '/Page': 'PageNo',
    '/Encrypt': 'Encrypt',
    '/ObjStm': 'ObjStm',
    '/JS': 'JS',
    '/JavaScript': 'JavaScript',
    '/AA': 'AA',
    '/OpenAction': 'OpenAction',
    '/AcroForm': 'AcroForm',
    '/JBIG2Decode': 'JBIG2Decode',
    '/RichMedia': 'RichMedia',
    '/Launch': 'Launch',
    '/EmbeddedFile': 'EmbeddedFile',
    '/XFA': 'XFA',
    '/Colors > 2^24': 'Colors',
}


def _extract_document_features(pdf):
    """Return the PyMuPDF-derived features of an already opened document."""
    doc_features = {}

    # Metadata lookups are best-effort: -1 marks a value that could not be read.
    try:
        doc_features['Pages'] = pdf.page_count
    except Exception:
        doc_features['Pages'] = -1

    try:
        doc_features['XrefLength'] = pdf.xref_length()
    except Exception:
        doc_features['XrefLength'] = -1

    try:
        doc_features['TitleCharacters'] = len(pdf.metadata.get('title', ''))
    except Exception:
        doc_features['TitleCharacters'] = -1

    doc_features['isEncrypted'] = 1 if pdf.is_encrypted else 0

    # Total number of image references over all pages.
    doc_features['Images'] = sum(
        len(pdf.get_page_images(page_no)) for page_no in range(pdf.page_count)
    )

    # Average size of the embedded files (0 when there are none, -1 on error).
    emb_count = pdf.embfile_count()
    if emb_count:
        try:
            # embfile_info() returns a dict; its 'size' entry is the file size.
            emb_size_sum = sum(
                pdf.embfile_info(index)['size'] for index in range(emb_count)
            )
        except Exception:
            doc_features['EmbeddedFiles'] = -1
        else:
            doc_features['EmbeddedFiles'] = emb_size_sum / emb_count
    else:
        doc_features['EmbeddedFiles'] = 0

    # 1 as soon as any page yields at least one whitespace-separated token.
    has_text = any(page.get_text().split() for page in pdf)
    doc_features['Text'] = 1 if has_text else 0

    return doc_features


def _extract_pdfid_features(pdf_file):
    """Return the pdfid report for pdf_file, or -1 placeholders on failure."""
    try:
        options = pdfid.get_fake_options()
        options.scan = True
        options.json = True
        report = pdfid.PDFiDMain([pdf_file], options)['reports'][0]

        pdf_features = dict(report)
        pdf_features.pop('version', None)

        # Add a dataset-named copy of every raw pdfid value.  The raw keys stay
        # in the dict as well, because code in this file (predict_malware)
        # looks features up by their raw pdfid names.
        for raw_name, dataset_name in _PDFID_FEATURE_NAMES.items():
            pdf_features[dataset_name] = pdf_features.get(raw_name, -1)
        return pdf_features
    except Exception as e:
        print(f"Error extracting pdfid features: {e}")
        placeholders = {name: -1 for name in _PDFID_FEATURE_NAMES.values()}
        placeholders['Header'] = '-1'  # the header is a string feature
        return placeholders


# Main function to extract features from the PDF file
def extract_pdf_features(pdf_file):
    """Extract structural features from a PDF with PyMuPDF and pdfid.

    Args:
        pdf_file: Path of the PDF file to analyse.

    Returns:
        A dict holding 'FileName', the PyMuPDF features ('Pages',
        'XrefLength', 'TitleCharacters', 'isEncrypted', 'Images',
        'EmbeddedFiles', 'Text') and the pdfid report, whose values are
        available both under the raw pdfid keys and under the dataset names
        listed in ``_PDFID_FEATURE_NAMES``.  Values that could not be read
        are -1 ('-1' for 'Header').  Returns None, after printing a message,
        when the file does not exist.

    Raises:
        Whatever ``fitz.open`` or the unguarded PyMuPDF calls raise for a
        document that cannot be opened or read.
    """
    # Checking if the file exists
    if not exists(pdf_file):
        print(f"File {pdf_file} not found")
        return None

    features = {'FileName': pdf_file}

    pdf = fitz.open(pdf_file)
    try:
        features.update(_extract_document_features(pdf))
    finally:
        # Release the document even when one of the PyMuPDF calls raises.
        pdf.close()

    features.update(_extract_pdfid_features(pdf_file))
    return features
def extract_features(pdf_file):
    """Return the feature dictionary for ``pdf_file``.

    Thin wrapper that delegates in-process to ``extract_pdf_features`` and
    hands back its result unchanged.
    """
    return extract_pdf_features(pdf_file)

def header_to_numeric(header):
    """Convert a PDF header such as '%PDF-1.4' to its numeric version.

    Args:
        header: The header string of the PDF.

    Returns:
        The version as a float (1.4 for '%PDF-1.4').  Returns 0.0 when the
        header is not a string, does not start with '%PDF-', or carries a
        version that is not a finite number, so a malformed header can
        never abort feature construction.
    """
    import math  # local import keeps this function self-contained

    if not isinstance(header, str) or not header.startswith('%PDF-'):
        return 0.0

    try:
        # The text between the first and second '-' is the version number.
        version = float(header.split('-')[1])
    except ValueError:
        return 0.0

    # float() accepts 'nan' and 'inf'; neither is a usable feature value.
    return version if math.isfinite(version) else 0.0

# Function to predict if the PDF contains malware
def predict_malware(
    pdf_file,
    model_path=os.path.join(os.path.dirname(__file__), 'saved_models', 'random_forest_model.pkl'),
):
    """Classify a PDF as benign or malicious with the saved model.

    Args:
        pdf_file: Path of the PDF file to classify.
        model_path: Path of the joblib-serialised model.  Defaults to
            'saved_models/random_forest_model.pkl' next to this file.

    Returns:
        The first element of ``model.predict``; 1 is reported as malicious,
        anything else as clean.

    Raises:
        FileNotFoundError: If no features could be extracted because the
            PDF file does not exist.
    """
    # Extract features
    features = extract_features(pdf_file)
    if features is None:
        # Fail with a clear error instead of an AttributeError further down.
        raise FileNotFoundError(f"Cannot extract features: {pdf_file} not found")
    print(features)

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load models from a trusted location.
    model = joblib.load(model_path)

    # Raw pdfid keys of the count features, in the order the model expects.
    # '/ObjStm' and '/Colors > 2^24' are the names the pdfid report uses.
    count_keys = (
        'obj',
        'endobj',
        'stream',
        'endstream',
        'xref',
        'trailer',
        'startxref',
        '/Page',
        '/Encrypt',
        '/ObjStm',
        '/JS',
        '/JavaScript',
        '/AA',
        '/OpenAction',
        '/AcroForm',
        '/JBIG2Decode',
        '/RichMedia',
        '/Launch',
        '/EmbeddedFile',
        '/XFA',
        '/Colors > 2^24',
        # Add more features as required by the model
    )

    # Select the required features for prediction (missing values become 0).
    feature_vector = [header_to_numeric(features.get('header', ''))]
    feature_vector.extend(features.get(key, 0) for key in count_keys)

    # Predict malware (assuming binary classification: 0 = benign, 1 = malicious)
    prediction = model.predict([feature_vector])

    if prediction[0] == 1:
        print("The PDF might be  Malicious.")
    else:
        print("The PDF is clean.")
    return prediction[0]

# Command-line entry point: classify the PDF whose path is the only argument.
if __name__ == "__main__":
    if len(sys.argv) != 2:
        # Show the name this script was actually invoked with rather than a
        # hard-coded name of a different file.
        print(f"Usage: python {os.path.basename(sys.argv[0])} <path/to/pdf_file>")
        sys.exit(1)

    predict_malware(sys.argv[1])