forked from insightcampus/sesac-nlp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path26 실습 - 통계기반 자연어처리 - 토픽모델링 (LSA)
1 lines (1 loc) · 10.2 KB
/
26 실습 - 통계기반 자연어처리 - 토픽모델링 (LSA)
1
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"26 실습 - 통계기반 자연어처리 - 토픽모델링 (LSA)","provenance":[],"collapsed_sections":[]},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[
{"cell_type":"markdown","metadata":{"id":"Jyp5nHXQsXez"},"source":["# 잠재의미분석 (Latent Semantic Analysis LSA)"]},
{"cell_type":"markdown","metadata":{"id":"rAQyWNVQWAh8"},"source":["## 1) 직접 구현"]},
{"cell_type":"code","metadata":{"id":"0X8TJKxn9Nvk","executionInfo":{"status":"ok","timestamp":1638099375968,"user_tz":-540,"elapsed":379,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}}},"source":["# Toy corpus: each string is one document of whitespace-separated food words.\n","docs = ['바나나 사과 포도 포도 짜장면',\n"," '사과 포도',\n"," '포도 바나나 짜장면',\n"," '짜장면 짬뽕 탕수육',\n"," '볶음밥 탕수육',\n"," '짜장면 짬뽕',\n"," '라면 스시 짜장면',\n"," '스시 ',\n"," '가츠동 스시 소바',\n"," '된장찌개 김치찌개 김치',\n"," '김치 된장 짜장면',\n"," '비빔밥 김치'\n"," ]\n","\n","# Number of latent topics (SVD components) to keep.\n","k = 4"],"execution_count":25,"outputs":[]},
{"cell_type":"code","metadata":{"id":"TRNB7P9Y9Hav","executionInfo":{"status":"ok","timestamp":1638099377172,"user_tz":-540,"elapsed":2,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}}},"source":["from sklearn.feature_extraction.text import CountVectorizer\n","\n","# Build the document-term count matrix (rows = documents, columns = vocabulary terms).\n","cv = CountVectorizer()\n","DTM = cv.fit_transform(docs).toarray()\n","# Column index -> term; used to label the SVD components in the next cell.\n","feature_name = cv.get_feature_names_out()\n","# Term -> column index mapping (kept for reference; not used below).\n","word2id = cv.vocabulary_"],"execution_count":26,"outputs":[]},
{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vlIMXqhm9zrq","executionInfo":{"status":"ok","timestamp":1638099377806,"user_tz":-540,"elapsed":3,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"4b3b144e-568f-4f55-fa04-583b43b92448"},"source":["from sklearn.decomposition import randomized_svd\n","\n","# Rank-k SVD of the count matrix; each row of VT is one topic expressed as term weights.\n","U, s, VT = randomized_svd(DTM, n_components = k, n_iter=10, random_state = 0)\n","\n","# Print the three highest-weighted terms of every topic.\n","for topic in VT :\n"," print([feature_name[i] for i in topic.argsort()[::-1][:3]])\n"],"execution_count":27,"outputs":[{"output_type":"stream","name":"stdout","text":["['포도', '짜장면', '바나나']\n","['짜장면', '김치', '짬뽕']\n","['김치', '된장찌개', '김치찌개']\n","['스시', '김치', '소바']\n"]}]},
{"cell_type":"code","metadata":{"id":"imptYJYB-y6P","executionInfo":{"status":"ok","timestamp":1638099364631,"user_tz":-540,"elapsed":2,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}}},"source":[""],"execution_count":24,"outputs":[]},
{"cell_type":"markdown","metadata":{"id":"31Kn3iEWLDz9"},"source":["## 2) sklearn 활용"]},
{"cell_type":"code","metadata":{"id":"2LVVXaR4EJCZ","executionInfo":{"status":"ok","timestamp":1638099704442,"user_tz":-540,"elapsed":288,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}}},"source":["# Corpus for the sklearn pipeline below: one whitespace-separated string per document.\n","doc_ls = ['바나나 사과 포도 포도 ',\n"," '사과 포도',\n"," '포도 바나나',\n"," '짜장면 짬뽕 탕수육',\n"," '볶음밥 탕수육',\n"," '짜장면 짬뽕',\n"," '라면 스시',\n"," '스시 ',\n"," '가츠동 스시 소바',\n"," '된장찌개 김치찌개 김치',\n"," '김치 된장 ',\n"," '비빔밥 김치'\n"," ]\n"],"execution_count":35,"outputs":[]},
{"cell_type":"code","metadata":{"id":"ZOPy5VKzKr8_","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1638099705139,"user_tz":-540,"elapsed":392,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"932696e6-dc8c-44f4-88a0-524d3b934997"},"source":["from sklearn.feature_extraction.text import TfidfVectorizer\n","from sklearn.decomposition import TruncatedSVD\n","\n","n_topics = 4\n","\n","# Fit TF-IDF on this section's corpus (doc_ls), not on the docs list left over from section 1.\n","tfidfv = TfidfVectorizer()\n","tfidf = tfidfv.fit_transform(doc_ls)\n","# A fixed random_state keeps the randomized solver reproducible across runs.\n","svd = TruncatedSVD(n_components = n_topics, algorithm = 'randomized', n_iter=100, random_state = 0)\n","# Document-topic matrix: one row per document, one column per topic.\n","svd.fit_transform(tfidf)"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"fkHor8JeRuFz","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1638099706539,"user_tz":-540,"elapsed":291,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"9ec9d3f0-aea1-4b95-9510-de5327886a5a"},"source":["# Label components with the vocabulary of the vectorizer that produced the TF-IDF matrix,\n","# so column indices are guaranteed to line up with svd.components_.\n","tfidf_feature_name = tfidfv.get_feature_names_out()\n","\n","# Print the three highest-weighted terms of every topic.\n","for topic in svd.components_:\n","    print([tfidf_feature_name[i] for i in topic.argsort()[::-1][:3]])"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"XQPbtrruV_rB","executionInfo":{"status":"ok","timestamp":1638099625106,"user_tz":-540,"elapsed":406,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"3eaf12f9-e0de-4bf0-beb6-f4ca2b5999e0"},"source":["\n","\n","\n"],"execution_count":null,"outputs":[]},
{"cell_type":"markdown","metadata":{"id":"QiJXdInJLn-4"},"source":["## 3) gensim 활용"]},
{"cell_type":"code","metadata":{"id":"lz82ZZpkLr9I","executionInfo":{"status":"ok","timestamp":1638099844422,"user_tz":-540,"elapsed":298,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}}},"source":["# Corpus for the gensim pipeline: one whitespace-separated string per document.\n","docs = ['바나나 사과 포도 포도',\n"," '사과 포도',\n"," '포도 바나나',\n"," '짜장면 짬뽕 탕수육',\n"," '볶음밥 탕수육',\n"," '짜장면 짬뽕',\n"," '라면 스시',\n"," '스시',\n"," '가츠동 스시 소바',\n"," '된장찌개 김치찌개 김치',\n"," '김치 된장',\n"," '비빔밥 김치'\n"," ]"],"execution_count":38,"outputs":[]},
{"cell_type":"code","metadata":{"id":"JjZIWeWuL1NT","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1638099875891,"user_tz":-540,"elapsed":311,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"07d1106c-7ccd-44cd-bb51-6b4d55088ac6"},"source":["# Tokenize on whitespace: the gensim calls below take each document as a list of tokens.\n","doc_ls = [doc.split() for doc in docs]\n","doc_ls[0]"],"execution_count":40,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['바나나', '사과', '포도', '포도']"]},"metadata":{},"execution_count":40}]},
{"cell_type":"code","metadata":{"id":"eCcg7TSHL5ZR","executionInfo":{"status":"ok","timestamp":1638100003669,"user_tz":-540,"elapsed":307,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}}},"source":["from gensim import corpora\n","from gensim.models import LsiModel\n","# NOTE(review): TfidfModel is imported but not used anywhere in this notebook.\n","from gensim.models import TfidfModel\n","\n","# Map every token to an integer id.\n","id2word = corpora.Dictionary(doc_ls)\n","# Bag-of-words corpus: each document becomes a list of (token_id, count) pairs.\n","corpus_TDM = [id2word.doc2bow(t) for t in doc_ls]\n","# LSA (LSI) model with 4 topics, fitted on the raw counts.\n","model_LSA = LsiModel(corpus_TDM, id2word=id2word, num_topics = 4)\n"],"execution_count":43,"outputs":[]},
{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"iYyc-JmCXfun","executionInfo":{"status":"ok","timestamp":1638100028450,"user_tz":-540,"elapsed":311,"user":{"displayName":"이민호","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiFPPatrtQJJCEfMd6D3DoTVRog9gVm7Ovj5Lex=s64","userId":"15829449822908558555"}},"outputId":"d661b556-c82c-4cfe-c07e-35b36f925b38"},"source":["# Show all 4 topics, each with its 3 most significant terms.\n","model_LSA.print_topics(4, 3)"],"execution_count":44,"outputs":[{"output_type":"execute_result","data":{"text/plain":["[(0, '0.816*\"포도\" + 0.408*\"바나나\" + 0.408*\"사과\"'),\n"," (1, '0.612*\"짜장면\" + 0.612*\"짬뽕\" + 0.484*\"탕수육\"'),\n"," (2, '-0.813*\"김치\" + -0.337*\"된장찌개\" + -0.337*\"김치찌개\"'),\n"," (3, '-0.815*\"스시\" + -0.368*\"소바\" + -0.368*\"가츠동\"')]"]},"metadata":{},"execution_count":44}]},
{"cell_type":"code","metadata":{"id":"Dh4NKtwVXsb1"},"source":[""],"execution_count":null,"outputs":[]}]}