# 3. Analysis
## 3.0. Functions to use in analysis

In [1]:
def find_tokens(df, token_dict):
    """
    Creates indicator for each row
    whether it contains specified tokens

    For every column named in token_dict a column '<column>_contain'
    is added (1 if the cell matches at least one token, 0 otherwise),
    plus a summary column 'contain_token' that is 1 when at least one
    of the indicator columns is 1.

    Args:
        df - analyzed dataframe. It is modified in place: the indicator
             columns are added to the passed object.
        token_dict - dictionary with specified tokens
             keys - column names
             values - a list of tokens. The tokens are joined with '|'
                      and matched as a regular expression, so regex
                      metacharacters inside tokens keep their meaning.
             E.g. to find news about Ukraine:
             token_dict = {'tags_bottom': ['Украин', 'Киев'],
                           'body_token_final': ['украин', 'киев'],
                           'title_token_final': ['украин', 'киев']}
    Returns:
        df - dataframe with added columns that indicate whether a news story
             contains specified tokens
    """
    new_columns = []
    for key, tokens in token_dict.items():
        new_col_name = key + '_contain'
        new_columns.append(new_col_name)

        searched_pattern = '|'.join(tokens)
        if searched_pattern:
            # na=False makes missing / non-string cells count as "no match"
            matches = df[key].str.contains(searched_pattern, na=False)
            df[new_col_name] = np.where(matches, 1, 0)
        else:
            # An empty pattern matches every string, which would flag
            # all rows; no tokens must mean no matches.
            df[new_col_name] = 0

    # If at least one of the columns contains tokens, we can say that
    # the news story contains tokens
    df['contain_token'] = np.minimum(df[new_columns].sum(axis=1), 1)

    return df

In [2]:
def most_frequent(ser, n):
    """
    Finds the most frequent n words
    in the tokenized column

    Each cell is expected to be the string form of a token list,
    e.g. "['word1', 'word2']". Brackets, quotes and commas are stripped
    and the remaining text is split on whitespace.

    Args: ser - series (from a tokenized column)
          n - a number of the most common words
    Returns: a list of tuples (word, frequency) with the n most
             common words, most frequent first

    """
    # Raw string keeps the regex identical while avoiding the invalid
    # escape sequences of the plain literal.
    cleaned = ser.str.replace(r"[\[',\]]", '', regex=True)

    # Missing or non-string cells come back as NaN and cannot be joined
    cleaned = cleaned.dropna()

    return Counter(" ".join(cleaned).split()).most_common(n)

In [3]:
def find_most_frequent(df, col, n, token_dict=None, groupby=None):
    """
    Finds the n words that appear the most frequently in the
    column "col".
    If "token_dict" is defined - finds the most frequent words
    only in rows that contain specified tokens.

    Args: *df - a dataframe with scraped data
               (must be tokenized). When token_dict is given,
               find_tokens adds its indicator columns to this
               dataframe in place;
          *col - string, a name of the tokenized column where the
               most frequent words need to be found;
          *n - integer, a number of the most common words;
          *token_dict - dictionary with specified tokens
               (see find_tokens for the format);
          *groupby - string, a name of the column whose values are
               utilized for grouping. In order to find the most
               common words for each year, use groupby='year'.

    Returns: df_frequent - without groupby: a list of tuples
               (word, frequency); with groupby: a dataframe indexed
               by the group values whose column "col" holds such a
               list for every group.
    """

    # Find rows that contain tokens, specified in token_dict
    # Filter these rows
    if token_dict is not None:
        df = find_tokens(df, token_dict)
        df = df[df['contain_token'] == 1]

    # Group by specified col
    if groupby is not None:
        return df.groupby(groupby).agg({col: lambda x: most_frequent(x, n)})

    return most_frequent(df[col], n)

In [4]:
def delete_words_from_list(frequent_words, words_to_delete):
    """
    Drops unwanted words from word-frequency pairs.

    Args:
        *frequent_words - list of tuples (word, frequency)
             that should be cleaned of stop words;
        *words_to_delete - list of stop words.

    Returns:
        *a new dictionary {word: frequency} holding every word of
             frequent_words that is not listed in words_to_delete
             (original order is kept).

    """
    banned = set(words_to_delete)
    word_to_freq = dict(frequent_words)

    return {
        word: freq
        for word, freq in word_to_freq.items()
        if word not in banned
    }

In [5]:
def delete_stop_words(frequent_words, words_to_delete):
    """
    Deletes words from the list of tuples (word, frequency)
    or from dataframe
    Args:
        *frequent_words - *list of tuples (word, frequency),
                           from which stop words must be removed;
                          Or
                          *dataframe whose FIRST column holds
                           lists of tuples (word, frequency).
        *words_to_delete - list of stop words

    Returns:
        *frequent_words - *dictionary {word: frequency} without
                           words specified in words_to_delete
                          Or
                          *the same dataframe with a column
                           'frequent words' (added in place) that
                           holds dictionaries {word: frequency}
    """

    # isinstance also accepts list subclasses, which would otherwise
    # fall through to the dataframe branch and fail on .iloc
    if isinstance(frequent_words, list):
        return delete_words_from_list(frequent_words, words_to_delete)

    # Dataframe input: clean every cell of the first column
    frequent_words['frequent words'] = frequent_words.iloc[:, 0].apply(
        delete_words_from_list, words_to_delete=words_to_delete)

    return frequent_words

In [6]:
def merge_dict_keys(dict_freq, lists_for_merge):
    """
    Merges all keys from the list into a single key.
    The values associated with these keys are added together.

    In every list the first key that is present in dict_freq becomes
    the target; the values of the other present keys are added to it
    and those keys are removed.

    Args:
        *dict_freq - dictionary {word: frequency}; it is modified
                     in place
        *lists_for_merge - list of lists; each list contains
                           keys that should be merged. It is not
                           modified.
    Returns:
        *dict_freq - the same dictionary with merged keys
    """

    for word_lst in lists_for_merge:

        # Keep only the keys that dict_freq actually contains.
        # dict.fromkeys drops repeated keys (order preserved), so a key
        # listed twice is neither counted twice nor deleted twice.
        present = list(dict.fromkeys(
            key for key in word_lst if key in dict_freq))

        # Nothing to merge unless at least two keys are present
        if len(present) < 2:
            continue

        target, *others = present
        for key in others:
            dict_freq[target] += dict_freq.pop(key)

    return dict_freq

In [7]:
def merge_words(frequent_words, lists_for_merge):
    """
    Merges words from the list (with similar meaning) into one word

    Args:
        *frequent_words - *dict {word: frequency},
                           in which keys should be merged;
                          Or
                          *dataframe whose SECOND column holds
                           dictionaries {word: frequency}.
        *lists_for_merge - list of lists; each list contains
                           keys that should be merged

    Returns:
        *frequent_words - *dictionary with merged keys
                          Or
                          *the same dataframe; its column
                           'frequent words' is (over)written in place
                           with the dictionaries with merged keys
    """

    # isinstance also covers dict subclasses such as Counter, which
    # would otherwise be treated as a dataframe and fail on .iloc
    if isinstance(frequent_words, dict):
        return merge_dict_keys(frequent_words, lists_for_merge)

    # Dataframe input: merge keys in every dictionary of the second column
    frequent_words['frequent words'] = frequent_words.iloc[:, 1].apply(
        merge_dict_keys, lists_for_merge=lists_for_merge)

    return frequent_words

In [8]:
def translate_ru_en(dict_ru, dict_ru_en):
    """
    Translates keys in the input dictionary from RU to EN

    Every key is first translated automatically with tss.google;
    translations listed in dict_ru_en then override the automatic
    ones. Finally a leading "the"/"a"/"an"/"to" and a trailing " the"
    are removed from the English text.

    Args: dict_ru - dictionary {word in Russian: frequency}
          dict_ru_en - RU-EN dictionary with better translation
                       than automatic; entries for words that are
                       not in dict_ru are ignored
    Returns: translation - dataframe indexed by the Russian word
                           with columns 'eng' and 'frequency'
    """
    # Translate keys
    keys_eng = []
    for key in dict_ru:
        try:
            keys_eng.append(tss.google(key, 'ru', 'en'))
        except Exception:
            # Best effort: a failed request leaves a blank placeholder.
            # Exception (not a bare except) keeps Ctrl+C able to stop
            # a long translation loop.
            keys_eng.append(" ")

    # Transform the dictionary into a dataframe
    # Index here is a word in Russian
    translation = pd.DataFrame.from_dict(dict_ru,
                                         orient='index',
                                         columns=['frequency'])
    translation['eng'] = keys_eng

    # Apply dictionary dict_ru_en, but only for words that are
    # present in the translation index
    known_words = set(translation.index)
    for word, english in dict_ru_en.items():
        if word in known_words:
            translation.loc[word, 'eng'] = english

    # Delete articles "a", "the" (and the particle "to") from translation.
    # Substituting instead of capturing "the rest" keeps the text in
    # front of a trailing " the" rather than blanking the whole cell.
    stripped = translation['eng'].str.replace(
        r'^(?:the|The|a|an|A|An|to|To) | the$', '', regex=True)

    # Non-string cells come back as NaN; keep their original value
    translation['eng'] = np.where(stripped.isnull(),
                                  translation['eng'],
                                  stripped)

    return translation[['eng', 'frequency']]

In [9]:
def translate_df_of_dict(df, dict_ru_en):
    """
    Translates df with dictionaries

    Args: df - dataframe whose SECOND column holds dictionaries
               {word in Russian: frequency}
          dict_ru_en - RU-EN dictionary passed on to translate_ru_en
    Returns: translate - the result of translate_ru_en for the
                         combined dictionary (frequencies of the same
                         word are summed over all rows)
    """
    # Add all the dictionaries, so we don't translate a word twice.
    # Starting from an empty Counter also covers a dataframe with no rows.
    sum_counter = Counter()
    for freq_dict in df.iloc[:, 1]:
        sum_counter += Counter(freq_dict)

    return translate_ru_en(dict(sum_counter), dict_ru_en)