forked from insightcampus/sesac-nlp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
31 실습 - 통계기반 자연어처리 - 감성분석
1 lines (1 loc) · 16.7 KB
/
31 실습 - 통계기반 자연어처리 - 감성분석
1
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"31 실습 - 통계기반 자연어처리 - 감성분석","provenance":[],"collapsed_sections":[]},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"markdown","metadata":{"id":"yPyMXdYHE0wn"},"source":["#감성분석(Sentiment Analysis)"]},{"cell_type":"markdown","metadata":{"id":"j61gv6QEE71q"},"source":["#1 사전기반 감성분석"]},{"cell_type":"code","metadata":{"id":"rVgDReTWWlDi","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1637505569413,"user_tz":-540,"elapsed":721,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"68d3fa2b-362b-4121-bb20-f2999372a9aa"},"source":["#KnuSentiLex 다운로드\n","!wget https://raw.githubusercontent.com/park1200656/KnuSentiLex/master/pos_pol_word.txt\n","!wget https://raw.githubusercontent.com/park1200656/KnuSentiLex/master/neg_pol_word.txt"],"execution_count":15,"outputs":[{"output_type":"stream","name":"stdout","text":["--2021-11-21 14:39:28-- https://raw.githubusercontent.com/park1200656/KnuSentiLex/master/pos_pol_word.txt\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n","HTTP request sent, awaiting response... 200 OK\n","Length: 85007 (83K) [text/plain]\n","Saving to: ‘pos_pol_word.txt.1’\n","\n","\rpos_pol_word.txt.1 0%[ ] 0 --.-KB/s \rpos_pol_word.txt.1 100%[===================>] 83.01K --.-KB/s in 0.01s \n","\n","2021-11-21 14:39:29 (6.17 MB/s) - ‘pos_pol_word.txt.1’ saved [85007/85007]\n","\n","--2021-11-21 14:39:29-- https://raw.githubusercontent.com/park1200656/KnuSentiLex/master/neg_pol_word.txt\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n","HTTP request sent, awaiting response... 200 OK\n","Length: 176542 (172K) [text/plain]\n","Saving to: ‘neg_pol_word.txt.1’\n","\n","neg_pol_word.txt.1 100%[===================>] 172.40K --.-KB/s in 0.02s \n","\n","2021-11-21 14:39:29 (6.86 MB/s) - ‘neg_pol_word.txt.1’ saved [176542/176542]\n","\n"]}]},{"cell_type":"code","metadata":{"id":"p3o_a5Zoc7Qy","executionInfo":{"status":"ok","timestamp":1637505778450,"user_tz":-540,"elapsed":255,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}}},"source":["dct = {}\n","\n","with open('pos_pol_word.txt', 'r') as f :\n"," dct['pos'] = f.read().split('\\n')[19:]\n","\n","with open('neg_pol_word.txt', 'r') as f :\n"," dct['neg'] = f.read().split('\\n')[19:] "],"execution_count":17,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"55OS-mEg81Eh","executionInfo":{"status":"ok","timestamp":1637505791431,"user_tz":-540,"elapsed":295,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"31132a04-2d3d-4f2b-c761-da05ec565ccb"},"source":["dct['pos'][:10]"],"execution_count":18,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['(-;',\n"," '(^^)',\n"," '(^-^)',\n"," '(^^*',\n"," '(^_^)',\n"," '(^o^)',\n"," '*^^*',\n"," '/^o^\\\\',\n"," ':(',\n"," 
\":'-(\"]"]},"metadata":{},"execution_count":18}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"MvUq9nuq84vs","executionInfo":{"status":"ok","timestamp":1637505805947,"user_tz":-540,"elapsed":266,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"71e97098-05db-4f3f-db53-6fd8d291ee44"},"source":["dct['neg'][:10]"],"execution_count":19,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['가난',\n"," '가난뱅이',\n"," '가난살이',\n"," '가난살이하다',\n"," '가난설음',\n"," '가난에',\n"," '가난에 쪼들려서',\n"," '가난하게',\n"," '가난하고',\n"," '가난하고 어렵다']"]},"metadata":{},"execution_count":19}]},{"cell_type":"code","metadata":{"id":"IzR_KFqQQ-pN","executionInfo":{"status":"ok","timestamp":1637505207090,"user_tz":-540,"elapsed":10,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}}},"source":["txt = \"코로나19의 여파로 전 세계 교사들과 학생들 모두 혼란스러운 시기를 보내고 있습니다. 초유의 상황에서도, 학생들을 위해 최선을 다하려는 선생님들의 노력이 계속되고 있는데요. 어려움 속에서, 선생님들은 어떤 방법으로 사랑을 전하고 있을까요? 뉴스G에서 전해드립니다. [리포트] 중국 저장성의 한 산골 마을. 초등학교 교사 왕진량 씨는 지난 2월 말부터, 매일 새벽 다섯 시면 부지런히 집을 나섭니다. 온라인 수업을 받을 수 없는, 깊은 산골 마을에 살고 있는 학생들을 찾아가기 위해서인데요. 선생님이 하루 동안 이동하는 거리는 대략 30km정도. 차도 없이 도보로 네 개의 마을을 돌아다니며 학생들을 만납니다. 매일 이어지는 강행군이지만, 오로지 아이들의 학습이 중단되어서는 안 된다는 생각뿐입니다. 혹시 모를 사태에 대비해 학생들과의 접촉은 최대한 줄입니다. 숙제를 내주고, 검사 후에 모르는 문제를 알려주는 식으로 일대일 수업을 진행하고 있는데요. 아이들에게 배우는 즐거움이 얼마나 소중한 것인지 잘 알기에, 선생님은 이렇게라도 수업을 할 수 있다는 데서 행복을 느낍니다. 영국의 한 초등학교 교사인 젠 포울스 씨는 매일 아침, 무거운 짐을 앞 뒤, 양 옆으로 짊어지고 씩씩하게 발걸음을 옮깁니다. 코로나19로 학교가 문을 닫은 뒤, 형편이 어려운 학생들을 위해 매일 78인분의 점심 도시락을 배달하고 있는데요. 선생님이 재직 중인 초등학교는 전체 학생의 41퍼센트가 무상 급식 대상자이기 때문입니다. 도시락의 무게는 18kg, 걸어야 하는 거리는 8km에 달하지만 기다리는 학생들을 생각하며 지치지 않고 발걸음을 재촉합니다. 학생들은 창문을 통해서 반갑게 인사하기도 하고, 선생님이 볼 수 있게 감사 메시지를 붙여 놓기도 하는데요. 선생님이 정성껏 준비해 손수 배달한 사랑의 도시락. 봉쇄된 도시의 굶주린 아이들에게 소중한 한 끼 식사, 그 이상의 의미가 되고 있습니다. 
미국 사우스다코타 주의 중학교 수학 교사인 크리스 와바 씨는, 커다란 화이트보드를 들고 학생의 집을 찾았습니다. 온라인 수업 후, 학생에게 이메일로 방정식 풀이법에 대한 질문을 받았기 때문인데요. 이메일로 답변해주는 것보다 직접 풀이 과정을 보여주는 게 낫다는 생각이었죠. 깜짝 놀란 학생을 마주한 채, 선생님은 현관문 앞에서 열정적으로 문제를 풀기 시작했습니다. 이 열정적인 강의는, 학생이 풀이법을 완벽히 이해할 때까지 이어졌는데요. 바이러스는 전 세계 교실에 혼란을 불러왔지만, 선생님들의 노력은 저마다의 방식으로 계속되고 있습니다. 어려움 속에서도 학생들을 위해 안간힘을 쓰고 있는 모든 선생님들에게, 응원과 박수를 함께 보냅니다.\""],"execution_count":3,"outputs":[]},{"cell_type":"code","metadata":{"id":"ZTgyj8kJSCkG","executionInfo":{"status":"ok","timestamp":1637506097176,"user_tz":-540,"elapsed":265,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}}},"source":["def analze_sentiment_by_dic(txt) :\n","    \"\"\"Score txt against the sentiment lexicon held in the global dct.\n","\n","    Every entry of dct['pos'] and dct['neg'] is tested with a plain substring\n","    check (entry in txt); no tokenization is applied.\n","\n","    Returns ((pos_ratio, pos_hits[:5]), (neg_ratio, neg_hits[:5])), where each\n","    ratio is that polarity's share of all matched entries. When nothing\n","    matches, both ratios are 0.0 and both sample lists are empty.\n","    \"\"\"\n","    pos = [p for p in dct['pos'] if p in txt]\n","    neg = [n for n in dct['neg'] if n in txt]\n","\n","    # NOTE(review): an empty-string entry would match any text ('' in txt is True);\n","    # confirm the lexicon lists contain no blank entries.\n","    total = len(pos) + len(neg)\n","    if total == 0 :\n","        # No lexicon entry occurs in txt: avoid ZeroDivisionError, keep the return shape.\n","        return ((0.0, []), (0.0, []))\n","\n","    return ((len(pos)/total, pos[:5]), (len(neg)/total, neg[:5]))\n","\n"],"execution_count":26,"outputs":[]},{"cell_type":"code","metadata":{"id":"TVRRF_5SSiNf","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1637506101816,"user_tz":-540,"elapsed":245,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"7573e79f-1a8f-4070-ed4c-3191dc4c689f"},"source":["analze_sentiment_by_dic(txt)"],"execution_count":27,"outputs":[{"output_type":"execute_result","data":{"text/plain":["((0.6470588235294118, ['낫다', '대상', '사랑을', '씩씩하게', '열정']),\n"," (0.35294117647058826, ['방정', '어려운', '어려움', '해', '혼란스러운']))"]},"metadata":{},"execution_count":27}]},{"cell_type":"markdown","metadata":{"id":"anKdEE0eEk8z"},"source":["# 2 나이브 베이즈 분류기 활용 감성분석"]},{"cell_type":"markdown","metadata":{"id":"xTVyt3BRKY7M"},"source":["## 1) 간단 
예제"]},{"cell_type":"code","metadata":{"id":"IoLvW8daCLbo","executionInfo":{"status":"ok","timestamp":1637505207092,"user_tz":-540,"elapsed":8,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}}},"source":["text = [(\"I love you\", \"P\"),\n"," (\"love happy weekend\", \"P\"),\n"," (\"bore work job\", \"N\"),\n"," (\"I hate you\", \"N\"),\n"," (\"bore weekend\", \"N\"),\n"," (\"happy together\", \"P\")]"],"execution_count":6,"outputs":[]},{"cell_type":"code","metadata":{"id":"tUNAlJcICqQj","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1637506187166,"user_tz":-540,"elapsed":275,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"7514bb4c-41a8-486e-e31c-33aac740b1fb"},"source":["X_train = [t[0] for t in text]\n","Y_train = [t[1] for t in text]\n","\n","X_train"],"execution_count":28,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['I love you',\n"," 'love happy weekend',\n"," 'bore work job',\n"," 'I hate you',\n"," 'bore weekend',\n"," 'happy together']"]},"metadata":{},"execution_count":28}]},{"cell_type":"code","metadata":{"id":"b8MGr_sZibyp","executionInfo":{"status":"ok","timestamp":1637506312888,"user_tz":-540,"elapsed":264,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}}},"source":["from sklearn.feature_extraction.text import CountVectorizer\n","from sklearn.naive_bayes import MultinomialNB\n","\n","count_vect = CountVectorizer()\n","X_train_counts = count_vect.fit_transform(X_train)\n","clf = MultinomialNB().fit(X_train_counts, 
Y_train)"],"execution_count":29,"outputs":[]},{"cell_type":"code","metadata":{"id":"x8zJ50ooDOTq","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1637506483813,"user_tz":-540,"elapsed":273,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"ac008638-a2b5-4f6e-ccb3-ebad21a69f3d"},"source":["print(clf.predict(count_vect.transform(['happy weekend'])))\n","print(clf.classes_)\n","print(clf.predict_proba(count_vect.transform(['happy weekend'])))"],"execution_count":35,"outputs":[{"output_type":"stream","name":"stdout","text":["['P']\n","['N' 'P']\n","[[0.25 0.75]]\n"]}]},{"cell_type":"code","metadata":{"id":"4r34wFrfGzOo","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1637506461917,"user_tz":-540,"elapsed":325,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"201fa721-13fd-44ab-f4bb-154d2c298e19"},"source":[""],"execution_count":34,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array(['N', 'P'], dtype='<U1')"]},"metadata":{},"execution_count":34}]},{"cell_type":"markdown","metadata":{"id":"ZFINscB8FS7Q"},"source":["## 2) 네이버 영화 리뷰 감성분석"]},{"cell_type":"code","metadata":{"id":"ovFtoItjDfFs","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1637506504592,"user_tz":-540,"elapsed":727,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"b27620fb-2c98-494c-c3da-64cbf9641c4a"},"source":["# 네이버 영화 리뷰 다운로드\n","!wget https://github.com/e9t/nsmc/raw/master/ratings.txt"],"execution_count":36,"outputs":[{"output_type":"stream","name":"stdout","text":["--2021-11-21 14:55:03-- 
https://github.com/e9t/nsmc/raw/master/ratings.txt\n","Resolving github.com (github.com)... 140.82.114.4\n","Connecting to github.com (github.com)|140.82.114.4|:443... connected.\n","HTTP request sent, awaiting response... 302 Found\n","Location: https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt [following]\n","--2021-11-21 14:55:04-- https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n","HTTP request sent, awaiting response... 200 OK\n","Length: 19515078 (19M) [text/plain]\n","Saving to: ‘ratings.txt.1’\n","\n","ratings.txt.1 100%[===================>] 18.61M --.-KB/s in 0.1s \n","\n","2021-11-21 14:55:04 (127 MB/s) - ‘ratings.txt.1’ saved [19515078/19515078]\n","\n"]}]},{"cell_type":"code","metadata":{"id":"gQIY8T7NFe0n","executionInfo":{"status":"ok","timestamp":1637506511154,"user_tz":-540,"elapsed":709,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}}},"source":["import pandas as pd\n","import numpy as np\n","df = pd.read_csv(\"./ratings.txt\",sep='\\t').dropna()"],"execution_count":37,"outputs":[]},{"cell_type":"code","metadata":{"id":"gJnjV-KQFnga","executionInfo":{"status":"ok","timestamp":1637506654733,"user_tz":-540,"elapsed":3440,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}}},"source":["from sklearn.model_selection import train_test_split\n","from sklearn.feature_extraction.text import CountVectorizer\n","from sklearn.naive_bayes import MultinomialNB\n","\n","X_train, X_test, Y_train, Y_test = train_test_split(df['document'], df['label'], random_state=0)\n","\n","count_vect = 
CountVectorizer()\n","X_train_counts = count_vect.fit_transform(X_train)\n","clf = MultinomialNB().fit(X_train_counts, Y_train) "],"execution_count":39,"outputs":[]},{"cell_type":"code","metadata":{"id":"2Qecu_93Hwu7","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1637506731800,"user_tz":-540,"elapsed":654,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"0855610e-19fa-45d1-9178-f9aa560bd4ce"},"source":["X_new_counts = count_vect.transform(X_test)\n","\n","predicted = clf.predict(X_new_counts)\n","np.mean(predicted == Y_test)"],"execution_count":40,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.8308732349293971"]},"metadata":{},"execution_count":40}]},{"cell_type":"code","metadata":{"id":"xPzdSWaRFvV1","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1637505212709,"user_tz":-540,"elapsed":421,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"d472bd69-4013-4126-e988-64139f89c783"},"source":[""],"execution_count":14,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.8300132005280211"]},"metadata":{},"execution_count":14}]}]}