-
Notifications
You must be signed in to change notification settings - Fork 3
/
code2picture.py
executable file
·107 lines (102 loc) · 3.5 KB
/
code2picture.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
import os, sys
import os.path
from sklearn.feature_extraction.text import TfidfTransformer
import re
from simhash import Simhash
from PIL import Image
from time import time
# Directory whose entries are read, one by one, into ``corpus``.
bening_code_okay_to = "C:\\Users\\islab718A\\Desktop\\malware\\hhj"
# Directory the generated .jpg pictures are saved into.
store_benefit_jpg = "C:\\Users\\islab718A\\Desktop\\malware\\test_image"
# Text of every input file, one string per file.
corpus = []
############################################################################## helpers: derive a pixel colour (r, g, b) and a position (x, y) from a string
def hash_djb2_generate_r_g_b(string):
    """Derive an (r, g, b) colour from *string* with the djb2 hash.

    The hash starts at 5381 and is updated for every character as
    ``h = h * 33 + ord(ch)``.  The colour is read from the low 24 bits of
    the final value: bits 16-23 are red, bits 8-15 green, bits 0-7 blue.

    Args:
        string: Text to hash; each element must be accepted by ``ord``.

    Returns:
        A tuple ``(r, g, b)`` of ints, each in the range 0-255.
    """
    digest = 5381
    for ch in string:
        # Only the low 24 bits are ever read, and they depend solely on the
        # low bits of the previous value, so truncating to 32 bits leaves
        # the result unchanged while stopping the integer from growing by
        # about five bits per character.
        digest = ((digest << 5) + digest + ord(ch)) & 0xFFFFFFFF
    return (digest >> 16) & 0xFF, (digest >> 8) & 0xFF, digest & 0xFF
def simhash(s, width=3):
    """Split *s* into overlapping character shingles.

    The text is lower-cased and every run of non-word characters is
    removed; the remaining characters are then cut into all substrings of
    ``width`` consecutive characters.  A text shorter than ``width``
    yields a single (possibly empty) shingle holding the whole text.

    Args:
        s: Text to tokenize.
        width: Shingle length in characters; defaults to 3.

    Returns:
        A list of shingle strings in order of appearance.

    Raises:
        ValueError: If ``width`` is smaller than 1.
    """
    if width < 1:
        raise ValueError("width must be a positive integer")
    normalized = re.sub(r'[^\w]+', '', s.lower())
    # max(..., 1) keeps one shingle even when the text is shorter than width.
    shingle_count = max(len(normalized) - width + 1, 1)
    return [normalized[i:i + width] for i in range(shingle_count)]
def after_SimHash(n):
    """Fold an integer fingerprint into a pair of 5-bit values.

    *n* is written as a 70-digit binary number (left-padded with zeros)
    and cut into ten groups of seven bits.  Each group is reduced to the
    XOR of its bits.  The first five parity bits, most significant first,
    form the first result and the last five form the second.

    Args:
        n: Non-negative integer that fits in 70 bits.

    Returns:
        A tuple ``(out1, out2)`` of ints, each in the range 0-31.

    Raises:
        ValueError: If ``n`` is negative or needs more than 70 bits.
    """
    total_bits = 70
    group_size = 7
    if n < 0 or n >> total_bits:
        raise ValueError("n must be a non-negative integer of at most 70 bits")
    bits = format(n, "0{}b".format(total_bits))
    # XOR of a group of bits equals the parity of its number of ones.
    parity = "".join(
        str(bits[start:start + group_size].count("1") & 1)
        for start in range(0, total_bits, group_size)
    )
    half = len(parity) // 2
    return int(parity[:half], 2), int(parity[half:], 2)
def get_x_y(string):
    """Return the value pair computed for *string*.

    The text is tokenized by ``simhash``, the tokens are fingerprinted
    with ``Simhash``, and the fingerprint value is folded into a pair by
    ``after_SimHash``.
    """
    shingles = simhash(string)
    fingerprint = Simhash(shingles).value
    return after_SimHash(fingerprint)
############################################################################## TF-IDF part: load the corpus and rank the words of every document
# Read every entry of the input directory into ``corpus`` (one string each).
# The listing is sorted because os.listdir() returns entries in arbitrary
# order, while the position of a file in ``corpus`` later determines the
# number used in its output picture name and in the skip-if-exists check.
for sample_name in sorted(os.listdir(bening_code_okay_to)):
    with open(os.path.join(bening_code_okay_to, sample_name), 'r') as myfile:
        # NOTE(review): newlines are removed outright rather than replaced by
        # a space, so the last token of a line is fused with the first token
        # of the next line - confirm this is intended.
        data = myfile.read().replace('\n', '')
    corpus.append(data)
# Build term counts and TF-IDF weights for the whole corpus.
# print is written in call form with a single argument so the output is the
# same under Python 2 and Python 3.
print("preparing..")
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# Vocabulary as a list, in feature-column order.  Newer scikit-learn releases
# only provide get_feature_names_out(); older ones only get_feature_names().
if hasattr(vectorizer, "get_feature_names_out"):
    word = vectorizer.get_feature_names_out().tolist()
else:
    word = vectorizer.get_feature_names()
print("total word is : " + str(len(word)))
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
# Scratch lists used by the picture-generation loop.
a = []
to_sim_list = []
# Picture layout parameters.
_IMAGE_SIDE = 32            # pictures are 32x32 pixels
_PATCH_RADIUS = 2           # each feature paints a 5x5 patch centred on (x, y)
_TERMS_PER_FEATURE = 120    # ranked terms joined into one feature string
_MAX_RANKED_TERMS = 9600    # ranks 0..9599 are used, i.e. 80 feature strings
_FIRST_IMAGE_NUMBER = 250   # document k is saved as "<k + 250>.jpg"


def _feature_strings(scores, vocabulary):
    """Return one document's space-joined term groups, highest score first."""
    # Sorting once and reversing yields the same ordering the per-chunk
    # slices of the ascending argsort produced; slices past the end of the
    # vocabulary are simply shorter or empty.
    ranked = scores.argsort()[::-1]
    return [
        ' '.join(vocabulary[ranked[start:start + _TERMS_PER_FEATURE]].tolist())
        for start in range(0, _MAX_RANKED_TERMS, _TERMS_PER_FEATURE)
    ]


def _render_picture(features):
    """Paint one coloured patch per feature string on a black picture."""
    img = Image.new('RGB', (_IMAGE_SIDE, _IMAGE_SIDE), "black")
    pixels = img.load()
    for text in features:
        r, g, b = hash_djb2_generate_r_g_b(text)
        x, y = get_x_y(text)
        for dy in range(-_PATCH_RADIUS, _PATCH_RADIUS + 1):
            for dx in range(-_PATCH_RADIUS, _PATCH_RADIUS + 1):
                px, py = x + dx, y + dy
                if not (0 <= px < _IMAGE_SIDE and 0 <= py < _IMAGE_SIDE):
                    continue
                # Earlier features win: only still-black pixels are painted.
                if pixels[px, py] == (0, 0, 0):
                    pixels[px, py] = (r, g, b)
    return img


# Turn every document of the corpus into one picture, skipping documents
# whose picture already exists so an interrupted run can be resumed.
vocabulary = np.array(word)
for k in range(len(corpus)):
    print("now handling : " + str(k))
    target = os.path.join(store_benefit_jpg, str(k + _FIRST_IMAGE_NUMBER) + ".jpg")
    if os.path.exists(target):
        continue
    aa = time()
    # Densify only this document's row instead of the whole TF-IDF matrix.
    to_sim_list = _feature_strings(tfidf[k].toarray().ravel(), vocabulary)
    _render_picture(to_sim_list).save(target)
    bb = time()
    print("handle time : " + str(bb - aa))