Skip to content

Commit

Permalink
'fisrt_push'
Browse files Browse the repository at this point in the history
  • Loading branch information
Nowow committed Jul 2, 2016
1 parent b8b30c2 commit f3f2a52
Show file tree
Hide file tree
Showing 9 changed files with 15,748 additions and 0 deletions.
1 change: 1 addition & 0 deletions syntax_model_files/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# word2vec_syntax
181 changes: 181 additions & 0 deletions syntax_model_files/gen_iter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# -*- coding: utf-8 -*-
"""
Created on Tue May 24 22:43:11 2016
@author: robert
"""

#TWEAK TIME

import pickle
import os
import re
# Use nltk.download() to fetch the stopwords corpus if it is not installed yet.




# Iterable to be passed to the word2vec class as sentences.
# Reads sentences one by one from the pickle dump.



#stops = set(stopwords.words('russian'))

# Russian stop words; createContext never emits these as a target or as a
# context word. Built by splitting a whitespace-separated string, which
# yields a plain list in exactly the order written below.
stops = (
    'чтоб между какой без но чуть для не куда '
    'себя всего даже был кто уж только с быть '
    'теперь много по надо когда этого '
    'три и опять или под более эти бы чем '
    'совсем сам раз хоть нибудь него уже сейчас '
    'никогда о ни можно ли потому тем будто '
    'в перед так два ничего а почти может '
    'было эту их нет впрочем им во лучше '
    'до про вот после что зачем иногда '
    'ее другой больше тоже еще от у потом всю '
    'над этой за если ж там есть '
    'через из как на чтобы такой том '
    'да этом хорошо к при были себе '
    'чего ней то вам один вдруг со '
    'тогда будет разве нельзя наконец ведь здесь '
    'тот какая этот же где ну конечно '
    'того тут была всегда свою об всех'
).split()

# futureStops = ['кто', 'что']


# Loop over the corpus and generate a pickle dump file that yields
# context pairs one by one.

# Default location of the pickle stream (the path the script always used).
_DEFAULT_DUMP_PATH = '/run/media/robert/1TB-1/linuxfolder/pythonworks/contDumpFinal'

# A token counts as a word only if its lemma column is purely alphabetic.
# NOTE(review): the range А-я does not contain 'ё'/'Ё', so lemmas with that
# letter are treated as non-words -- confirm this is intended.
_WORD_RE = re.compile('[A-Za-zА-Яа-я]+$')

# Upper bound on how far a chain of unusable heads/dependents is followed.
_MAX_HOPS = 10


def _parse_token(fields):
    """Return ``(token_id, {'word': ..., 'ref': ...})`` for one split line.

    ``word`` is the lemma column (index 2) or ``None`` when it is not purely
    alphabetic; ``ref`` is the head id column (index 6), kept as a string.
    """
    lemma = fields[2]
    word = lemma if _WORD_RE.match(lemma) else None
    return fields[0], {'word': word, 'ref': fields[6]}


def _find_head_word(sent, slot, stop_words):
    """Return the nearest usable word above ``slot``, or ``None``.

    Heads that are stop words or non-words are skipped by following their own
    ``ref``; the walk gives up at the root ('0'), at an id that is not in the
    sentence, or after ``_MAX_HOPS`` heads (which also bounds cyclic refs).
    """
    ref = sent[slot]['ref']
    for _ in range(_MAX_HOPS):
        if ref in ('0', 0):
            return None
        head = sent.get(ref)
        if head is None:
            return None
        word = head['word']
        if word is not None and word not in stop_words:
            return word
        ref = head['ref']
    return None


def _collect_dependent_words(sent, slot, stop_words):
    """Return the usable words found below ``slot`` in the dependency tree.

    A usable direct dependent contributes its own word.  For an unusable one
    (stop word or non-word) its own dependents are searched instead.
    """
    found = []
    for dep_id, dep in sent.items():
        if dep['ref'] != slot:
            continue
        word = dep['word']
        if word is not None and word not in stop_words:
            found.append(word)
            continue
        # Unusable dependent: descend through it.  The scan deliberately
        # keeps going after `current` changes and after a hit, so several
        # words may be collected in one pass; `hops` bounds the total work.
        current = dep_id
        hops = 0
        while hops < _MAX_HOPS:
            hops += 1
            for child_id, child in sent.items():
                if child['ref'] != current:
                    continue
                child_word = child['word']
                if child_word is None or child_word in stop_words:
                    current = child_id
                    hops += 1
                else:
                    found.append(child_word)
                    hops = _MAX_HOPS
    return found


def _dump_sentence(sent, stop_words, dump_file):
    """Pickle one context record per usable word of ``sent``; return the count.

    A record is ``[target, head?, dependents...]`` and is written only when
    it holds at least one context word besides the target.
    """
    written = 0
    for slot, token in sent.items():
        target = token['word']
        if target is None or target in stop_words:
            continue
        record = [target]
        head_word = _find_head_word(sent, slot, stop_words)
        if head_word is not None:
            record.append(head_word)
        record.extend(_collect_dependent_words(sent, slot, stop_words))
        if len(record) > 1:
            pickle.dump(record, dump_file)
            written += 1
    return written


def createContext(root_directory, dump_path=_DEFAULT_DUMP_PATH, stop_words=None):
    """Turn a tree of ``.conll`` files into a pickle stream of context records.

    Every ``.conll`` file under ``root_directory`` is read line by line.  Each
    line is lower-cased and split on whitespace; column 0 is the token id,
    column 2 the word that is emitted and column 6 the id of its head.  For
    every usable word of a sentence one list ``[word, head, dependents...]``
    is appended to ``dump_path`` with ``pickle.dump``.

    NOTE(review): one record per target word is assumed here; confirm that
    this matches how the dump is consumed.

    Args:
        root_directory: directory that is walked recursively.
        dump_path: file the records are appended to (opened in ``'ab'`` mode).
        stop_words: iterable of words to ignore; defaults to the module-level
            ``stops`` list.

    Returns:
        The number of records written during this call.

    Raises:
        ValueError: if the id column of a line is not an integer.
        OSError: if the dump file or a document cannot be opened.
    """
    # A frozenset makes the many membership tests O(1).
    stop_words = frozenset(stops if stop_words is None else stop_words)
    dumpCounter = 0

    with open(dump_path, 'ab') as pickleDump:
        for root, _dirs, files in os.walk(root_directory):
            for fname in files:
                if not fname.endswith('.conll'):
                    continue
                with open(os.path.join(root, fname), 'r') as document:
                    print('Opened document ' + fname)
                    sentDict = {}
                    # Number of tokens already seen in the current sentence,
                    # minus one before the first token of the file.
                    wordCounter = -1
                    for line in document:
                        fields = line.lower().split()
                        # Blank separator lines and lines too short to carry
                        # the id, word and head columns are ignored.
                        if len(fields) < 7:
                            continue
                        wordCounter += 1
                        # Ids grow within a sentence; an id that is not
                        # larger than the running count starts a new one.
                        if wordCounter >= int(fields[0]):
                            dumpCounter += _dump_sentence(
                                sentDict, stop_words, pickleDump)
                            sentDict = {}
                            wordCounter = 0
                        slot, token = _parse_token(fields)
                        sentDict[slot] = token
                    # The final sentence has no successor to trigger a
                    # flush, so it is written here.
                    dumpCounter += _dump_sentence(
                        sentDict, stop_words, pickleDump)

    return dumpCounter













13 changes: 13 additions & 0 deletions syntax_model_files/gensim/models/compile_cython.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 12 21:01:49 2016
@author: robert
"""

from distutils.core import setup
from Cython.Build import cythonize
import numpy


# Cythonize the .pyx source at the hard-coded path below and pass the
# resulting extension modules to setup(), with NumPy's header directory on
# the include path.
pyx_source = '/run/media/robert/1TB-1/linuxfolder/anaconda3/lib/python3.5/site-packages/gensim/models/word2vec_inner_synt2.pyx'
extension_modules = cythonize(pyx_source)
setup(ext_modules=extension_modules, include_dirs=[numpy.get_include()])
Loading

0 comments on commit f3f2a52

Please sign in to comment.