{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d7b6994b",
   "metadata": {},
   "source": [
    "# 3. Analysis\n",
    "## 3.0. Functions to use in analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f91135b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_tokens(df, token_dict):\n",
    "    \"\"\"\n",
    "    Creates indicator for each row\n",
    "    whether it contains specified tokens\n",
    "    \n",
    "    Args: \n",
    "    df - analyzed dataframe\n",
    "    token_dict - dictionary with specified tokens\n",
    "                 keys - column names\n",
    "                 values - a list of tokens\n",
    "                 E.g. to find news about Ukraine:\n",
    "                 token_dict = {'tags_bottom': ['Украин', 'Киев'],\n",
    "                               'body_token_final': ['украин', 'киев'],\n",
    "                               'title_token_final': ['украин', 'киев']}\n",
    "    Returns:\n",
    "    df - dataframe with added columns that indicate whether a news story\n",
    "         contains specified tokens\n",
    "    \"\"\"\n",
    "    new_columns = []\n",
    "    for key in token_dict:\n",
    "        searched_pattern = ('|').join(token_dict[key])\n",
    "        \n",
    "        # Add a column-indicator\n",
    "        new_col_name = key+'_contain'\n",
    "        new_columns.append(new_col_name)\n",
    "        df[new_col_name] = np.where(df[key].str.contains(searched_pattern)==True, 1, 0)\n",
    "        \n",
    "    # If at least one of the columns contains tokens, we can say that \n",
    "    # the news story contains tokens\n",
    "    df['contain_token'] = np.minimum(df[new_columns].sum(axis=1), 1)\n",
    "    \n",
    "    return df       "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ac5b4211",
   "metadata": {},
   "outputs": [],
   "source": [
    "def most_frequent(ser, n):\n",
    "    \"\"\"\n",
    "    Finds the most frequent n words \n",
    "    in the tokenized column\n",
    "    \n",
    "    Args: ser - series (from a tokenized column)\n",
    "          n - a number of the most common words\n",
    "    Rerurns: a Counter object with the most common words\n",
    "             (a list of tuples (word, frequency))\n",
    "    \n",
    "    \"\"\"\n",
    "    return Counter(\" \".join(ser.str.replace('[\\[\\',\\]]', '', regex=True)).split()).most_common(n)    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3580bc2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_most_frequent(df, col, n, token_dict=None, groupby=None):\n",
    "    \"\"\"\n",
    "    Finds the n words that appear the most frequently in the \n",
    "    column \"col\".\n",
    "    If \"token_dict\" is defined - finds the most frequent words\n",
    "    only in rows that contain specified tokens.\n",
    "    \n",
    "    Args: *df - a dataframe with scraped data \n",
    "                (must be tokenized);\n",
    "          *col - string, a name of the tkenized column where the \n",
    "                 most frequent words need to be found;\n",
    "          *n - integer, a number of the most common words;\n",
    "          *token_dict - dictionary with specified tokens; \n",
    "          *groupby - string, a name of the column whose values are \n",
    "                     utilized for grouping. In order to find the most \n",
    "                     common words for each year, use groupby='year'. \n",
    "    \n",
    "    Returns: df_frequent - a dataframe containing Counter objects.  \n",
    "    \"\"\"\n",
    "    \n",
    "    # Find rows that contain tokens, specified in token_dict\n",
    "    # Filter these rows\n",
    "    if token_dict != None:\n",
    "        df = find_tokens(df, token_dict)\n",
    "        df = df[(df['contain_token']==1)]\n",
    "        \n",
    "    # Group by specified col\n",
    "    if groupby != None:\n",
    "        df_frequent = df.groupby(groupby).agg({col: lambda x: most_frequent(x, n)})\n",
    "    else:\n",
    "        df_frequent = most_frequent(df[col], n)\n",
    "        \n",
    "    return df_frequent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "684832dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def delete_words_from_list(frequent_words, words_to_delete):\n",
    "    \"\"\"\n",
    "    Deletes words from the list of tuples (word, frequency)\n",
    "    Args: \n",
    "    *frequent_words - *list of tuples (word, frequency),\n",
    "                       from which stop words must be removed;\n",
    "    *words_to_delete - list of stop words.\n",
    "    \n",
    "    Returns: \n",
    "    *frequent_words - dictionary {word: frequency} without \n",
    "                      words specified in words_to_delete.\n",
    "             \n",
    "    \"\"\"\n",
    "    frequent_words = dict(frequent_words)\n",
    "    \n",
    "    for key in words_to_delete:\n",
    "        if key in frequent_words:\n",
    "            del frequent_words[key]\n",
    "                \n",
    "    return frequent_words      "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "36bf4bcc",
   "metadata": {},
   "outputs": [],
   "source": [
    "def delete_stop_words(frequent_words, words_to_delete):\n",
    "    \"\"\"\n",
    "    Deletes words from the list of tuples (word, frequency)\n",
    "    or from dataframe\n",
    "    Args: \n",
    "    *frequent_words - *list of tuples (word, frequency),\n",
    "                       from which stop words must be removed;\n",
    "                      Or\n",
    "                      *dataframe that have a column with \n",
    "                       list of tuples (word, frequency).\n",
    "    *words_to_delete - list of stop words\n",
    "    \n",
    "    Returns: \n",
    "    *frequent_words - *dictionary {word: frequency} without \n",
    "                       words specified in words_to_delete\n",
    "                      Or\n",
    "                      *dataframe that have a column with \n",
    "                       dictionaries {word: frequency} \n",
    "    \"\"\"\n",
    "    \n",
    "    if type(frequent_words)==list:\n",
    "        frequent_words = delete_words_from_list(frequent_words, words_to_delete)\n",
    "    else:\n",
    "        frequent_words['frequent words'] = \\\n",
    "            frequent_words.iloc[:, 0].apply(delete_words_from_list, words_to_delete=words_to_delete)\n",
    "        \n",
    "    return frequent_words     "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c1ce6c20",
   "metadata": {},
   "outputs": [],
   "source": [
    "def merge_dict_keys(dict_freq, lists_for_merge):\n",
    "    \"\"\"\n",
    "    Merges all keys from the list into a single key. \n",
    "    The values associated with these keys are added together.\n",
    "    \n",
    "    Args:\n",
    "    *dict_freq - dictionary {word: frequency}\n",
    "    *lists_for_merge - list of lists; each list contains \n",
    "                       keys that should be merged\n",
    "    Returns:\n",
    "    *dict_freq - dictionary mith merged keys\n",
    "    \"\"\"\n",
    "         \n",
    "    # Make a copy of lists_for_merge so we don't change it\n",
    "    lists = copy.deepcopy(lists_for_merge)\n",
    "    \n",
    "    for word_lst in lists:\n",
    "               \n",
    "        # Make sure dict_freq contains elements with keys from \n",
    "        # lists_for_merge. If dict_freq does not contain such an \n",
    "        # element, we should remove it from the list.\n",
    "        for key in word_lst.copy():\n",
    "            if key not in dict_freq:\n",
    "                word_lst.remove(key)\n",
    "                \n",
    "        # Merge keys in the dict_freq\n",
    "        if len(word_lst) > 1:\n",
    "            for key in word_lst[1:]:\n",
    "                dict_freq[word_lst[0]] += dict_freq[key]\n",
    "                del dict_freq[key]   \n",
    "                \n",
    "    return dict_freq             "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "387be8d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def merge_words(frequent_words, lists_for_merge):\n",
    "    \"\"\"\n",
    "    Merges words from the list (with similar meaning) into one word \n",
    "    \n",
    "    Args: \n",
    "    *frequent_words - *dict {word, frequency},\n",
    "                       in which keys should be merged;\n",
    "                      Or\n",
    "                      *dataframe that have a column with \n",
    "                       dictionaries {word, frequency}.\n",
    "    *lists_for_merge - list of lists; each list contains \n",
    "                       keys that should be merged\n",
    "    \n",
    "    Returns: \n",
    "    *frequent_words - *dictionary with merged keys\n",
    "                      Or\n",
    "                      *dataframe that have a column with \n",
    "                       dictionaries (with merged keys) \n",
    "    \"\"\"\n",
    "    \n",
    "    if type(frequent_words)==dict:\n",
    "        frequent_words = merge_dict_keys(frequent_words, lists_for_merge)\n",
    "    else:\n",
    "        frequent_words['frequent words'] = \\\n",
    "            frequent_words.iloc[:, 1].apply(merge_dict_keys, lists_for_merge=lists_for_merge)\n",
    "        \n",
    "    return frequent_words     "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e87f7561",
   "metadata": {},
   "outputs": [],
   "source": [
    "def translate_ru_en(dict_ru, dict_ru_en):\n",
    "    \"\"\"\n",
    "    Translates keys in the input dictionary from RU to EN\n",
    "    \n",
    "    Args: dict_ru - dictionary with keys in Russian\n",
    "          dict_ru_en - RU-EN dictionary with better translation \n",
    "                       than automatic\n",
    "    Returns: translation - dataframe with translation\n",
    "    \"\"\"\n",
    "    keys_rus = list(dict_ru.keys())\n",
    "    \n",
    "    # Translate keys\n",
    "    keys_eng = []\n",
    "    for key in keys_rus:\n",
    "        \n",
    "        try:\n",
    "            keys_eng.append(tss.google(key, 'ru', 'en'))\n",
    "        except:\n",
    "            keys_eng.append(\" \")\n",
    "            \n",
    "    \n",
    "    \n",
    "    # Transform the dictionary into a dataframe\n",
    "    # Index here is a word in Russian\n",
    "    translation = pd.DataFrame.from_dict(dict_ru, \n",
    "                                         orient='index',\n",
    "                                         columns=['frequency'])\n",
    "    translation['eng'] = keys_eng\n",
    "        \n",
    "    \n",
    "    # Apply dictionary dict_ru_en \n",
    "    dict_ru_en_1 = dict_ru_en.copy()\n",
    "    \n",
    "    # Find keys in dict_ru_en_1 that are not present in translation.index\n",
    "    index_set = set(translation.index)\n",
    "    keys_set = set(dict_ru_en_1)\n",
    "    keys_to_delete = keys_set.difference(index_set)\n",
    "    \n",
    "    # Delete these keys\n",
    "    for key in dict_ru_en_1.copy():\n",
    "        if key in keys_to_delete:\n",
    "            del dict_ru_en_1[key]\n",
    "            \n",
    "    # Apply dictionary for remaining keys\n",
    "    for key in dict_ru_en_1:\n",
    "        translation.loc[key, 'eng'] = dict_ru_en_1[key]\n",
    "        \n",
    "    # Delete articles \"a\", \"the\" from translation\n",
    "    translation[['article', 'wt article']]  = \\\n",
    "        translation['eng'].str.extract('(^the |^The |^a |^an |^A |^An |^to |^To | the$)(.*)')\n",
    "    \n",
    "    translation['eng'] = np.where(translation['wt article'].isnull(), \n",
    "                                  translation['eng'],\n",
    "                                  translation['wt article']) \n",
    "    \n",
    "    \n",
    "    return translation[['eng', 'frequency']]  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "55044cb3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def translate_df_of_dict(df, dict_ru_en):\n",
    "    \"\"\"\n",
    "    Translates df with dictionaries \n",
    "    \"\"\"\n",
    "    # Add all the dictionaries, so we don't translate a word twice\n",
    "    sum_counter = Counter(df.iloc[0, 1])\n",
    "    \n",
    "    for i in range(1, len(df)):\n",
    "        sum_counter += Counter(df.iloc[i, 1])\n",
    "        \n",
    "    translate = translate_ru_en(dict(sum_counter), dict_ru_en) \n",
    "    \n",
    "    return translate"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}