{ "cells": [ { "cell_type": "markdown", "id": "d7b6994b", "metadata": {}, "source": [ "# 3. Analysis\n", "## 3.0. Functions to use in analysis" ] }, { "cell_type": "code", "execution_count": 1, "id": "f91135b2", "metadata": {}, "outputs": [], "source": [ "def find_tokens(df, token_dict):\n", " \"\"\"\n", " Creates indicator for each row\n", " whether it contains specified tokens\n", " \n", " Args: \n", " df - analyzed dataframe\n", " token_dict - dictionary with specified tokens\n", " keys - column names\n", " values - a list of tokens\n", " E.g. to find news about Ukraine:\n", " token_dict = {'tags_bottom': ['Украин', 'Киев'],\n", " 'body_token_final': ['украин', 'киев'],\n", " 'title_token_final': ['украин', 'киев']}\n", " Returns:\n", " df - dataframe with added columns that indicate whether a news story\n", " contains specified tokens\n", " \"\"\"\n", " new_columns = []\n", " for key in token_dict:\n", " searched_pattern = ('|').join(token_dict[key])\n", " \n", " # Add a column-indicator\n", " new_col_name = key+'_contain'\n", " new_columns.append(new_col_name)\n", " df[new_col_name] = np.where(df[key].str.contains(searched_pattern)==True, 1, 0)\n", " \n", " # If at least one of the columns contains tokens, we can say that \n", " # the news story contains tokens\n", " df['contain_token'] = np.minimum(df[new_columns].sum(axis=1), 1)\n", " \n", " return df " ] }, { "cell_type": "code", "execution_count": 2, "id": "ac5b4211", "metadata": {}, "outputs": [], "source": [ "def most_frequent(ser, n):\n", " \"\"\"\n", " Finds the most frequent n words \n", " in the tokenized column\n", " \n", " Args: ser - series (from a tokenized column)\n", " n - a number of the most common words\n", " Rerurns: a Counter object with the most common words\n", " (a list of tuples (word, frequency))\n", " \n", " \"\"\"\n", " return Counter(\" \".join(ser.str.replace('[\\[\\',\\]]', '', regex=True)).split()).most_common(n) " ] }, { "cell_type": "code", "execution_count": 3, "id": "3580bc2d", "metadata": {}, "outputs": [], "source": [ "def find_most_frequent(df, col, n, token_dict=None, groupby=None):\n", " \"\"\"\n", " Finds the n words that appear the most frequently in the \n", " column \"col\".\n", " If \"token_dict\" is defined - finds the most frequent words\n", " only in rows that contain specified tokens.\n", " \n", " Args: *df - a dataframe with scraped data \n", " (must be tokenized);\n", " *col - string, a name of the tkenized column where the \n", " most frequent words need to be found;\n", " *n - integer, a number of the most common words;\n", " *token_dict - dictionary with specified tokens; \n", " *groupby - string, a name of the column whose values are \n", " utilized for grouping. In order to find the most \n", " common words for each year, use groupby='year'. \n", " \n", " Returns: df_frequent - a dataframe containing Counter objects. 
\n", " \"\"\"\n", " \n", " # Find rows that contain tokens, specified in token_dict\n", " # Filter these rows\n", " if token_dict != None:\n", " df = find_tokens(df, token_dict)\n", " df = df[(df['contain_token']==1)]\n", " \n", " # Group by specified col\n", " if groupby != None:\n", " df_frequent = df.groupby(groupby).agg({col: lambda x: most_frequent(x, n)})\n", " else:\n", " df_frequent = most_frequent(df[col], n)\n", " \n", " return df_frequent" ] }, { "cell_type": "code", "execution_count": 4, "id": "684832dd", "metadata": {}, "outputs": [], "source": [ "def delete_words_from_list(frequent_words, words_to_delete):\n", " \"\"\"\n", " Deletes words from the list of tuples (word, frequency)\n", " Args: \n", " *frequent_words - *list of tuples (word, frequency),\n", " from which stop words must be removed;\n", " *words_to_delete - list of stop words.\n", " \n", " Returns: \n", " *frequent_words - dictionary {word: frequency} without \n", " words specified in words_to_delete.\n", " \n", " \"\"\"\n", " frequent_words = dict(frequent_words)\n", " \n", " for key in words_to_delete:\n", " if key in frequent_words:\n", " del frequent_words[key]\n", " \n", " return frequent_words " ] }, { "cell_type": "code", "execution_count": 5, "id": "36bf4bcc", "metadata": {}, "outputs": [], "source": [ "def delete_stop_words(frequent_words, words_to_delete):\n", " \"\"\"\n", " Deletes words from the list of tuples (word, frequency)\n", " or from dataframe\n", " Args: \n", " *frequent_words - *list of tuples (word, frequency),\n", " from which stop words must be removed;\n", " Or\n", " *dataframe that have a column with \n", " list of tuples (word, frequency).\n", " *words_to_delete - list of stop words\n", " \n", " Returns: \n", " *frequent_words - *dictionary {word: frequency} without \n", " words specified in words_to_delete\n", " Or\n", " *dataframe that have a column with \n", " dictionaries {word: frequency} \n", " \"\"\"\n", " \n", " if type(frequent_words)==list:\n", " frequent_words = delete_words_from_list(frequent_words, words_to_delete)\n", " else:\n", " frequent_words['frequent words'] = \\\n", " frequent_words.iloc[:, 0].apply(delete_words_from_list, words_to_delete=words_to_delete)\n", " \n", " return frequent_words " ] }, { "cell_type": "code", "execution_count": 6, "id": "c1ce6c20", "metadata": {}, "outputs": [], "source": [ "def merge_dict_keys(dict_freq, lists_for_merge):\n", " \"\"\"\n", " Merges all keys from the list into a single key. \n", " The values associated with these keys are added together.\n", " \n", " Args:\n", " *dict_freq - dictionary {word: frequency}\n", " *lists_for_merge - list of lists; each list contains \n", " keys that should be merged\n", " Returns:\n", " *dict_freq - dictionary mith merged keys\n", " \"\"\"\n", " \n", " # Make a copy of lists_for_merge so we don't change it\n", " lists = copy.deepcopy(lists_for_merge)\n", " \n", " for word_lst in lists:\n", " \n", " # Make sure dict_freq contains elements with keys from \n", " # lists_for_merge. 
{ "cell_type": "code", "execution_count": 7, "id": "387be8d3", "metadata": {}, "outputs": [], "source": [
 "def merge_words(frequent_words, lists_for_merge):\n",
 "    \"\"\"\n",
 "    Merges words with similar meanings into one word.\n",
 "    \n",
 "    Args:\n",
 "        frequent_words - dictionary {word: frequency}\n",
 "              in which keys should be merged,\n",
 "              or a dataframe whose second column holds such dictionaries\n",
 "        lists_for_merge - list of lists; each list contains\n",
 "              keys that should be merged\n",
 "    \n",
 "    Returns:\n",
 "        frequent_words - dictionary with merged keys,\n",
 "              or a dataframe with a column of such dictionaries\n",
 "    \"\"\"\n",
 "    if isinstance(frequent_words, dict):\n",
 "        frequent_words = merge_dict_keys(frequent_words, lists_for_merge)\n",
 "    else:\n",
 "        frequent_words['frequent words'] = frequent_words.iloc[:, 1].apply(\n",
 "            merge_dict_keys, lists_for_merge=lists_for_merge)\n",
 "    \n",
 "    return frequent_words" ] }, { "cell_type": "code", "execution_count": 8, "id": "e87f7561", "metadata": {}, "outputs": [], "source": [
 "# Assumed alias: the 'translators' package, whose 4.x API exposes tss.google()\n",
 "import translators as tss\n",
 "\n",
 "\n",
 "def translate_ru_en(dict_ru, dict_ru_en):\n",
 "    \"\"\"\n",
 "    Translates the keys of the input dictionary from RU to EN.\n",
 "    \n",
 "    Args:\n",
 "        dict_ru - dictionary with keys in Russian\n",
 "        dict_ru_en - manual RU-EN dictionary that overrides the\n",
 "              automatic translation where it is inaccurate\n",
 "    Returns:\n",
 "        translation - dataframe with the translation\n",
 "    \"\"\"\n",
 "    keys_rus = list(dict_ru.keys())\n",
 "    \n",
 "    # Translate the keys; fall back to a blank on any request error\n",
 "    keys_eng = []\n",
 "    for key in keys_rus:\n",
 "        try:\n",
 "            keys_eng.append(tss.google(key, 'ru', 'en'))\n",
 "        except Exception:\n",
 "            keys_eng.append(\" \")\n",
 "    \n",
 "    # Transform the dictionary into a dataframe;\n",
 "    # the index is the word in Russian\n",
 "    translation = pd.DataFrame.from_dict(dict_ru,\n",
 "                                         orient='index',\n",
 "                                         columns=['frequency'])\n",
 "    translation['eng'] = keys_eng\n",
 "    \n",
 "    # Apply the manual dictionary dict_ru_en\n",
 "    dict_ru_en_1 = dict_ru_en.copy()\n",
 "    \n",
 "    # Find keys in dict_ru_en_1 that are not present in translation.index\n",
 "    index_set = set(translation.index)\n",
 "    keys_set = set(dict_ru_en_1)\n",
 "    keys_to_delete = keys_set.difference(index_set)\n",
 "    \n",
 "    # Delete these keys\n",
 "    for key in dict_ru_en_1.copy():\n",
 "        if key in keys_to_delete:\n",
 "            del dict_ru_en_1[key]\n",
 "    \n",
 "    # Apply the dictionary for the remaining keys\n",
 "    for key in dict_ru_en_1:\n",
 "        translation.loc[key, 'eng'] = dict_ru_en_1[key]\n",
 "    \n",
 "    # Strip a leading article or \"to\" (and a trailing \" the\") from the translation\n",
 "    translation[['article', 'wt article']] = translation['eng'].str.extract(\n",
 "        '(^the |^The |^a |^an |^A |^An |^to |^To | the$)(.*)')\n",
 "    \n",
 "    translation['eng'] = np.where(translation['wt article'].isnull(),\n",
 "                                  translation['eng'],\n",
 "                                  translation['wt article'])\n",
 "    \n",
 "    return translation[['eng', 'frequency']]" ] }, { "cell_type": "code", "execution_count": 9, "id": "55044cb3", "metadata": {}, "outputs": [], "source": [
 "def translate_df_of_dict(df, dict_ru_en):\n",
 "    \"\"\"\n",
 "    Translates a dataframe whose second column holds\n",
 "    {word: frequency} dictionaries.\n",
 "    \"\"\"\n",
 "    # Sum up all the dictionaries first, so that no word\n",
 "    # is translated twice\n",
 "    sum_counter = Counter(df.iloc[0, 1])\n",
 "    \n",
 "    for i in range(1, len(df)):\n",
 "        sum_counter += Counter(df.iloc[i, 1])\n",
 "    \n",
 "    translate = translate_ru_en(dict(sum_counter), dict_ru_en)\n",
 "    \n",
 "    return translate" ] },
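{ "cell_type": "markdown", "id": "f7a84b9c", "metadata": {}, "source": [ "A usage sketch for the translation helpers. `tss.google()` performs a live web request, so this cell needs network access and its raw output can vary; the manual RU-EN overrides below are illustrative assumptions." ] }, { "cell_type": "code", "execution_count": null, "id": "a9b05c1d", "metadata": {}, "outputs": [], "source": [
 "# Frequencies from the sketch above; the manual RU-EN entries override\n",
 "# the machine translation for stems it renders poorly\n",
 "dict_ru = {'росс': 120, 'украин': 110}\n",
 "manual_ru_en = {'росс': 'Russia', 'украин': 'Ukraine'}\n",
 "\n",
 "# Returns a dataframe indexed by the Russian stem,\n",
 "# with the columns ['eng', 'frequency']\n",
 "translate_ru_en(dict_ru, manual_ru_en)" ] }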
], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 5 }