Skip to content

Commit

Permalink
update notebooks; move functions to utils
Browse files Browse the repository at this point in the history
  • Loading branch information
Leavingseason committed Jul 27, 2020
1 parent 6238d41 commit 171d244
Show file tree
Hide file tree
Showing 9 changed files with 488 additions and 233 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -161,3 +161,5 @@ nohup.out
##### kdd 2020 tutorial data folder
scenarios/KDD2020-tutorial/data_folder/
scenarios/KDD2020-tutorial/.ipynb_checkpoints/
scenarios/academic/KDD2020-tutorial/data_folder/
scenarios/academic/KDD2020-tutorial/.ipynb_checkpoints/
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,13 @@
"\n",
"2. LightGCN implementation [TensorFlow]: https://github.com/kuandeng/lightgcn"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
1 change: 0 additions & 1 deletion reco_utils/recommender/deeprec/models/dkn_item2item.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ def _build_doc_embedding(self, candidate_word_batch, candidate_entity_batch):
news_field_embed = tf.tanh(tf.matmul(news_field_embed, W))
return news_field_embed


def eval(self, sess, feed_dict):
feed_dict[self.layer_keeps] = self.keep_prob_test
feed_dict[self.is_train_stage] = False
Expand Down
143 changes: 127 additions & 16 deletions scenarios/academic/KDD2020-tutorial/step1_data_preparation.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -77,15 +77,15 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"start to train word embedding... \tdone . \n",
"time elapses: 649.8s\n"
"time elapses: 436.0s\n"
]
}
],
Expand Down Expand Up @@ -123,25 +123,25 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/data/home/jialia/jialia/kdd2020tutorial/formal_02/recommenders/scenarios/KDD2020-tutorial\n",
"/data/home/jialia/jialia/kdd2020tutorial/formal_03/recommenders/scenarios/academic/KDD2020-tutorial\n",
"fatal: destination path 'Fast-TransX' already exists and is not an empty directory.\n",
"epoch 0 454690.656250\n",
"epoch 1 376927.000000\n",
"epoch 2 344530.656250\n",
"epoch 3 315695.781250\n",
"epoch 4 290692.281250\n",
"epoch 5 268658.906250\n",
"epoch 6 250159.546875\n",
"epoch 7 231006.828125\n",
"epoch 8 215869.140625\n",
"epoch 9 200701.406250\n"
"epoch 0 457997.468750\n",
"epoch 1 381886.843750\n",
"epoch 2 342276.531250\n",
"epoch 3 313029.500000\n",
"epoch 4 285492.718750\n",
"epoch 5 262623.656250\n",
"epoch 6 240311.156250\n",
"epoch 7 221671.421875\n",
"epoch 8 205992.953125\n",
"epoch 9 191884.171875\n"
]
}
],
Expand All @@ -166,7 +166,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -175,12 +175,12 @@
"entity_file = os.path.join(OutFile_dir_KG, 'entity2vec.vec') \n",
"context_file = os.path.join(OutFile_dir_KG, 'context2vec.vec') \n",
"kg_file = os.path.join(OutFile_dir_KG, 'train2id.txt') \n",
"gen_context_embedding(entity_file, context_file, kg_file)"
"gen_context_embedding(entity_file, context_file, kg_file, dim=EMBEDDING_LENGTH)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -206,6 +206,17 @@
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Reference\n",
"\\[1\\] Wang, Hongwei, et al. \"DKN: Deep Knowledge-Aware Network for News Recommendation.\" Proceedings of the 2018 World Wide Web Conference on World Wide Web. International World Wide Web Conferences Steering Committee, 2018.<br>\n",
"\\[2\\] Knowledge Graph Embeddings including TransE, TransH, TransR and PTransE. https://github.com/thunlp/KB2E <br>\n",
"\\[3\\] Wu, Fangzhao, et al. \"MIND: A Large-scale Dataset for News Recommendation\" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html <br>\n",
"\\[4\\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
70 changes: 44 additions & 26 deletions scenarios/academic/KDD2020-tutorial/step3_run_dkn.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
"outputs": [],
"source": [
"import sys\n",
"sys.path.append(\"../../\")\n",
"sys.path.append(\"../../../\")\n",
"from reco_utils.recommender.deeprec.deeprec_utils import *\n",
"from reco_utils.recommender.deeprec.models.dkn import *\n",
"from reco_utils.recommender.deeprec.io.dkn_iterator import *\n",
Expand All @@ -73,7 +73,17 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## data paths"
"## data paths\n",
"Usually we will debug and search hyper-parameters on a small dataset. You can switch between the small dataset and full dataset by changing the value of `tag`."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"tag = 'full' # small or full"
]
},
{
Expand All @@ -87,7 +97,7 @@
"outputs": [],
"source": [
"data_path = 'data_folder/my/DKN-training-folder'\n",
"tag = 'small'\n",
"\n",
"yaml_file = './dkn.yaml' # os.path.join(data_path, r'../../../../../../dkn.yaml')\n",
"train_file = os.path.join(data_path, r'train_{0}.txt'.format(tag))\n",
"valid_file = os.path.join(data_path, r'valid_{0}.txt'.format(tag))\n",
Expand Down Expand Up @@ -121,7 +131,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"kg_file=None,user_clicks=None,FEATURE_COUNT=None,FIELD_COUNT=None,data_format=dkn,PAIR_NUM=None,DNN_FIELD_NUM=None,n_user=None,n_item=None,n_user_attr=None,n_item_attr=None,iterator_type=None,SUMMARIES_DIR=None,MODEL_DIR=data_folder/my/DKN-training-folder/save_models,wordEmb_file=data_folder/my/DKN-training-folder/word_embedding.npy,entityEmb_file=data_folder/my/DKN-training-folder/entity_embedding.npy,contextEmb_file=data_folder/my/DKN-training-folder/context_embedding.npy,news_feature_file=data_folder/my/DKN-training-folder/../paper_feature.txt,user_history_file=data_folder/my/DKN-training-folder/user_history_small.txt,use_entity=True,use_context=True,doc_size=15,history_size=20,word_size=194755,entity_size=57267,entity_dim=32,entity_embedding_method=TransE,transform=True,train_ratio=None,dim=32,layer_sizes=[300],cross_layer_sizes=None,cross_layers=None,activation=['sigmoid'],cross_activation=identity,user_dropout=False,dropout=[0.0],attention_layer_sizes=32,attention_activation=relu,attention_dropout=0.0,model_type=dkn,method=classification,load_saved_model=False,load_model_name=None,filter_sizes=[1, 2, 3],num_filters=50,mu=None,fast_CIN_d=0,use_Linear_part=False,use_FM_part=False,use_CIN_part=False,use_DNN_part=False,init_method=uniform,init_value=0.01,embed_l2=0.0,embed_l1=0.0,layer_l2=0.0,layer_l1=0.0,cross_l2=0.0,cross_l1=0.0,reg_kg=0.0,learning_rate=0.002,lr_rs=1,lr_kg=0.5,kg_training_interval=5,max_grad_norm=0.5,is_clip_norm=True,dtype=32,loss=log_loss,optimizer=adam,epochs=10,batch_size=100,enable_BN=False,show_step=10000,save_model=True,save_epoch=1,metrics=['auc'],write_tfevents=False,item_embedding_dim=None,cate_embedding_dim=None,user_embedding_dim=None,train_num_ngs=4,need_sample=True,embedding_dropout=0.3,user_vocab=None,item_vocab=None,cate_vocab=None,pairwise_metrics=['group_auc', 'mean_mrr', 
'ndcg@2;4;6'],EARLY_STOP=100,max_seq_length=None,hidden_size=None,L=None,T=None,n_v=None,n_h=None,min_seq_length=1,attention_size=None,att_fcn_layer_sizes=None,dilations=None,kernel_size=None,embed_size=None,n_layers=None,decay=None,eval_epoch=None,top_k=None\n"
"kg_file=None,user_clicks=None,FEATURE_COUNT=None,FIELD_COUNT=None,data_format=dkn,PAIR_NUM=None,DNN_FIELD_NUM=None,n_user=None,n_item=None,n_user_attr=None,n_item_attr=None,iterator_type=None,SUMMARIES_DIR=None,MODEL_DIR=data_folder/my/DKN-training-folder/save_models,wordEmb_file=data_folder/my/DKN-training-folder/word_embedding.npy,entityEmb_file=data_folder/my/DKN-training-folder/entity_embedding.npy,contextEmb_file=data_folder/my/DKN-training-folder/context_embedding.npy,news_feature_file=data_folder/my/DKN-training-folder/../paper_feature.txt,user_history_file=data_folder/my/DKN-training-folder/user_history_full.txt,use_entity=True,use_context=True,doc_size=15,history_size=20,word_size=194755,entity_size=57267,entity_dim=32,entity_embedding_method=TransE,transform=True,train_ratio=None,dim=32,layer_sizes=[300],cross_layer_sizes=None,cross_layers=None,activation=['sigmoid'],cross_activation=identity,user_dropout=False,dropout=[0.0],attention_layer_sizes=32,attention_activation=relu,attention_dropout=0.0,model_type=dkn,method=classification,load_saved_model=False,load_model_name=None,filter_sizes=[1, 2, 3],num_filters=50,mu=None,fast_CIN_d=0,use_Linear_part=False,use_FM_part=False,use_CIN_part=False,use_DNN_part=False,init_method=uniform,init_value=0.01,embed_l2=0.0,embed_l1=0.0,layer_l2=0.0,layer_l1=0.0,cross_l2=0.0,cross_l1=0.0,reg_kg=0.0,learning_rate=0.002,lr_rs=1,lr_kg=0.5,kg_training_interval=5,max_grad_norm=0.5,is_clip_norm=True,dtype=32,loss=log_loss,optimizer=adam,epochs=10,batch_size=100,enable_BN=False,show_step=10000,save_model=True,save_epoch=1,metrics=['auc'],write_tfevents=False,item_embedding_dim=None,cate_embedding_dim=None,user_embedding_dim=None,train_num_ngs=4,need_sample=True,embedding_dropout=0.3,user_vocab=None,item_vocab=None,cate_vocab=None,pairwise_metrics=['group_auc', 'mean_mrr', 
'ndcg@2;4;6'],EARLY_STOP=100,max_seq_length=None,hidden_size=None,L=None,T=None,n_v=None,n_h=None,min_seq_length=1,attention_size=None,att_fcn_layer_sizes=None,dilations=None,kernel_size=None,embed_size=None,n_layers=None,decay=None,eval_epoch=None,top_k=None\n"
]
}
],
Expand Down Expand Up @@ -194,8 +204,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'auc': 0.5017, 'group_auc': 0.5008, 'mean_mrr': 0.4522, 'ndcg@2': 0.321, 'ndcg@4': 0.5095, 'ndcg@6': 0.5866}\n",
"0.24880879720052082\n"
"{'auc': 0.5081, 'group_auc': 0.5099, 'mean_mrr': 0.4596, 'ndcg@2': 0.333, 'ndcg@4': 0.5194, 'ndcg@6': 0.5922}\n",
"3.487539263566335\n"
]
}
],
Expand All @@ -208,7 +218,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {
"pycharm": {
"is_executing": false
Expand All @@ -220,22 +230,33 @@
"name": "stdout",
"output_type": "stream",
"text": [
"step 10000 , total_loss: 0.2335, data_loss: 0.2335\n",
"step 20000 , total_loss: 0.1945, data_loss: 0.1945\n",
"step 30000 , total_loss: 0.3731, data_loss: 0.3731\n",
"step 40000 , total_loss: 0.2881, data_loss: 0.2881\n",
"step 50000 , total_loss: 0.2333, data_loss: 0.2333\n",
"step 60000 , total_loss: 0.2471, data_loss: 0.2471\n",
"step 70000 , total_loss: 0.3202, data_loss: 0.3202\n",
"step 80000 , total_loss: 0.2380, data_loss: 0.2380\n",
"at epoch 1\n",
"train info: logloss loss:0.362814348739838\n",
"eval info: auc:0.8611, group_auc:0.85, mean_mrr:0.4799, ndcg@2:0.3917, ndcg@4:0.5043, ndcg@6:0.5552\n",
"at epoch 1 , train time: 141.9 eval time: 23.1\n",
"at epoch 2\n",
"train info: logloss loss:0.32036037545555107\n",
"eval info: auc:0.8716, group_auc:0.8594, mean_mrr:0.4977, ndcg@2:0.4136, ndcg@4:0.5274, ndcg@6:0.574\n",
"at epoch 2 , train time: 140.8 eval time: 23.0\n",
"at epoch 3\n",
"train info: logloss loss:0.3088067305876711\n",
"eval info: auc:0.8747, group_auc:0.8614, mean_mrr:0.5112, ndcg@2:0.4293, ndcg@4:0.537, ndcg@6:0.5852\n",
"at epoch 3 , train time: 141.0 eval time: 22.8\n",
"at epoch 4\n",
"train info: logloss loss:0.2974359316747665\n",
"eval info: auc:0.8839, group_auc:0.8759, mean_mrr:0.533, ndcg@2:0.456, ndcg@4:0.5662, ndcg@6:0.6096\n",
"at epoch 4 , train time: 140.8 eval time: 23.1\n"
"train info: logloss loss:0.26912009667484815\n",
"eval info: auc:0.9287, group_auc:0.9215, mean_mrr:0.6699, ndcg@2:0.6266, ndcg@4:0.7028, ndcg@6:0.7286\n",
"at epoch 1 , train time: 1626.1 eval time: 346.5\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-7-af5aaf72af27>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/data/home/jialia/jialia/kdd2020tutorial/formal_03/recommenders/reco_utils/recommender/deeprec/models/base_model.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, train_file, valid_file, test_file)\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[0mimpression\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 437\u001b[0m \u001b[0mdata_size\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 438\u001b[0;31m ) in self.iterator.load_data_from_file(train_file):\n\u001b[0m\u001b[1;32m 439\u001b[0m \u001b[0mstep_result\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_sess\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_data_input\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 440\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstep_loss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstep_data_loss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msummary\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstep_result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/data/home/jialia/jialia/kdd2020tutorial/formal_03/recommenders/reco_utils/recommender/deeprec/io/dkn_iterator.py\u001b[0m in \u001b[0;36mload_data_from_file\u001b[0;34m(self, infile)\u001b[0m\n\u001b[1;32m 170\u001b[0m \u001b[0mcandidate_news_entity_index_batch\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 171\u001b[0m \u001b[0mclick_news_entity_index_batch\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 172\u001b[0;31m \u001b[0mimpression_id_list\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 173\u001b[0m )\n\u001b[1;32m 174\u001b[0m \u001b[0mdata_size\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/data/home/jialia/jialia/kdd2020tutorial/formal_03/recommenders/reco_utils/recommender/deeprec/io/dkn_iterator.py\u001b[0m in \u001b[0;36m_convert_data\u001b[0;34m(self, label_list, candidate_news_index_batch, click_news_index_batch, candidate_news_entity_index_batch, click_news_entity_index_batch, impression_id_list)\u001b[0m\n\u001b[1;32m 293\u001b[0m )\n\u001b[1;32m 294\u001b[0m res[\"click_news_index_batch\"] = np.asarray(\n\u001b[0;32m--> 295\u001b[0;31m \u001b[0mclick_news_index_batch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mint64\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 296\u001b[0m )\n\u001b[1;32m 297\u001b[0m res[\"candidate_news_entity_index_batch\"] = np.asarray(\n",
"\u001b[0;32m~/.conda/envs/reco_gpu_kdd/lib/python3.6/site-packages/numpy/core/_asarray.py\u001b[0m in \u001b[0;36masarray\u001b[0;34m(a, dtype, order)\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 84\u001b[0m \"\"\"\n\u001b[0;32m---> 85\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 86\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
Expand Down Expand Up @@ -308,10 +329,7 @@
"metadata": {},
"source": [
"## Reference\n",
"\\[1\\] Wang, Hongwei, et al. \"DKN: Deep Knowledge-Aware Network for News Recommendation.\" Proceedings of the 2018 World Wide Web Conference on World Wide Web. International World Wide Web Conferences Steering Committee, 2018.<br>\n",
"\\[2\\] Knowledge Graph Embeddings including TransE, TransH, TransR and PTransE. https://github.com/thunlp/KB2E <br>\n",
"\\[3\\] Wu, Fangzhao, et al. \"MIND: A Large-scale Dataset for News Recommendation\" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html <br>\n",
"\\[4\\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/"
"\\[1\\] Wang, Hongwei, et al. \"DKN: Deep Knowledge-Aware Network for News Recommendation.\" Proceedings of the 2018 World Wide Web Conference on World Wide Web. International World Wide Web Conferences Steering Committee, 2018.<br>\n"
]
},
{
Expand Down
Loading

0 comments on commit 171d244

Please sign in to comment.