Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[HFUT_Student]运行CL4KT模型时出现问题(附解决方法) #39

Open
yujiebing opened this issue Dec 29, 2024 · 1 comment
Open

Comments

@yujiebing
Copy link

Traceback (most recent call last):
File "D:\major assignment\examples\single_model\run_cl4kt_demo.py", line 9, in
run_edustudio(
File "D:\major assignment\edustudio\quickstart\quickstart.py", line 72, in run_edustudio
raise e
File "D:\major assignment\edustudio\quickstart\quickstart.py", line 57, in run_edustudio
traintpl = cls(cfg)
File "D:\major assignment\edustudio\traintpl\gd_traintpl.py", line 25, in init
super().init(cfg)
File "D:\major assignment\edustudio\traintpl\base_traintpl.py", line 54, in init
self.datatpl = self.get_datatpl_obj()
File "D:\major assignment\edustudio\traintpl\base_traintpl.py", line 87, in get_datatpl_obj
datatpl: BaseDataTPL = self.datatpl_cls.from_cfg(self.cfg)
File "D:\major assignment\edustudio\datatpl\common\general_datatpl.py", line 107, in from_cfg
return cls(cfg, **kwargs)
File "D:\major assignment\edustudio\datatpl\common\edu_datatpl.py", line 30, in init
super().init(cfg, df, df_train, df_valid, df_test, status)
File "D:\major assignment\edustudio\datatpl\common\general_datatpl.py", line 78, in init
self.process_data()
File "D:\major assignment\edustudio\datatpl\common\general_datatpl.py", line 143, in process_data
self.process_load_data_from_middata()
File "D:\major assignment\edustudio\datatpl\common\general_datatpl.py", line 126, in process_load_data_from_middata
kwargs = op.process(**kwargs)
File "D:\major assignment\edustudio\atom_op\mid2cache\single\M2C_CL4KT_OP.py", line 25, in process
kwargs = self.compute_cpt2difflevel(**kwargs)
File "D:\major assignment\edustudio\atom_op\mid2cache\single\M2C_CL4KT_OP.py", line 65, in compute_cpt2difflevel
for i, (c_list, r_list) in enumerate(zip(kwargs['df_train_folds'][0]['cpt_unfold_seq:token_seq'], kwargs['df_train_folds'][0]['label_seq:float_seq'])):
KeyError: 'df_train_folds'

经调试发现,cl4kt的kwargs并没有'df_train_folds'键值对。通过比对其他KT模型的数据模板,我发现是因为原子操作M2C_CL4KT_OP缺少了对'df_train_folds'的赋值。继续探索发现,在其他模型的数据模板中,对'df_train_folds'的赋值由M2C_RandomDataSplit4KT原子操作完成。于是我参照M2C_RandomDataSplit4KT对M2C_CL4KT_OP进行了修改。运行后,该报错消失,但又产生了新的报错,如下:
Traceback (most recent call last):
File "D:\major assignment\examples\single_model\run_cl4kt_demo.py", line 9, in
run_edustudio(
File "D:\major assignment\edustudio\quickstart\quickstart.py", line 72, in run_edustudio
raise e
File "D:\major assignment\edustudio\quickstart\quickstart.py", line 58, in run_edustudio
traintpl.start()
File "D:\major assignment\edustudio\traintpl\gd_traintpl.py", line 70, in start
self.model.build_cfg()
File "D:\major assignment\edustudio\model\KT\cl4kt.py", line 55, in build_cfg
self.n_item = self.datatpl_cfg['dt_info']['exer_count']
KeyError: 'exer_count'

经对比发现,datatpl_cfg['dt_info']['exer_count']的赋值是由M2C_RandomDataSplit4KT的set_dt_info完成,而M2C_CL4KT_OP确实缺失了这一步。因此我对照M2C_RandomDataSplit4KT的set_dt_info对M2C_CL4KT_OP的set_dt_info进行了相应修改。再次测试,代码成功运行。
修改后的完整M2C_CL4KT_OP代码如下:
from ..KT import M2C_BuildSeqInterFeats
import torch
import pandas as pd
import numpy as np
from edustudio.datatpl.utils import PadSeqUtil
from collections import defaultdict
from edustudio.datatpl.utils import SpliterUtil
from itertools import chain

class M2C_CL4KT_OP(M2C_BuildSeqInterFeats):
    """Mid2Cache atomic operation for the CL4KT model.

    Extends ``M2C_BuildSeqInterFeats`` with:
      * a train/valid/test fold split (``df_train_folds`` / ``df_valid_folds`` /
        ``df_test_folds``) so downstream templates find the fold entries they
        expect (fixes the original ``KeyError: 'df_train_folds'``);
      * per-concept difficulty maps (``easier_cpts`` / ``harder_cpts``)
        computed from the training fold, consumed by CL4KT's contrastive
        augmentation;
      * dataset statistics (``stu_count`` / ``exer_count`` / ``cpt_count``)
        in ``dt_info`` (fixes the original ``KeyError: 'exer_count'``).
    """

    default_cfg = {
        'seed': 2023,
        'divide_by': 'stu',                  # split unit: by student id
        'window_size': 200,                  # max interaction-sequence length
        "divide_scale_list": [7, 1, 2],      # train : valid : test proportions
        'sequence_truncation': '',           # options: 'recent' or 'history'
    }

    def __init__(self, m2c_cfg, n_folds, is_dataset_divided) -> None:
        super().__init__(m2c_cfg, n_folds, is_dataset_divided)

    def process(self, **kwargs):
        """Build padded sequences, split folds, then attach difficulty maps.

        Mirrors ``M2C_RandomDataSplit4KT``: when the dataset is not already
        divided, students are partitioned into folds; otherwise the
        pre-divided sequence dicts are wrapped as single folds.
        """
        kwargs = super().process(**kwargs)
        df_seq = kwargs['df_seq']
        df_train_seq = kwargs.get('df_train_seq', None)
        # NOTE(review): key is spelled 'df_validn_seq' here — confirm it matches
        # the key actually emitted by the base template ('df_valid_seq'?).
        df_valid_seq = kwargs.get('df_validn_seq', None)
        df_test_seq = kwargs.get('df_test_seq', None)

        if not self.is_dataset_divided:
            assert df_train_seq is None and df_valid_seq is None and df_test_seq is None
            self.window_size = df_seq['exer_seq:token_seq'].shape[1]
            if self.m2c_cfg['divide_by'] == 'stu':
                if self.n_folds == 1:
                    train_dict, valid_dict, test_dict = self._divide_data_df_by_stu_one_fold(df_seq)
                    kwargs['df_train_folds'] = [train_dict]
                    kwargs['df_valid_folds'] = [valid_dict]
                    kwargs['df_test_folds'] = [test_dict]
                else:
                    (kwargs['df_train_folds'], kwargs['df_valid_folds'],
                     kwargs['df_test_folds']) = self._divide_data_df_by_stu_multi_fold(df_seq)
            elif self.m2c_cfg['divide_by'] == 'time':
                raise NotImplementedError
            else:
                raise ValueError(f"unknown divide_by: {self.m2c_cfg['divide_by']}")
        else:
            assert df_train_seq is not None and df_test_seq is not None
            self.window_size = df_train_seq['exer_seq:token_seq'].shape[1]
            kwargs['df_train_folds'] = [df_train_seq]
            kwargs['df_valid_folds'] = [df_valid_seq]
            kwargs['df_test_folds'] = [df_test_seq]
        kwargs = self.compute_cpt2difflevel(**kwargs)
        return kwargs

    def _dict_index_flag(self, df_seq: dict, flag: np.ndarray) -> dict:
        """Select the rows of every array in ``df_seq`` where ``flag`` is True."""
        return {k: v[flag] for k, v in df_seq.items()}

    def _divide_data_df_by_stu_one_fold(self, df_seq: dict):
        """Split one fold by student id.

        Returns:
            (df_train_seq, df_valid_seq, df_test_seq) — ``df_valid_seq`` is
            ``None`` when the divide scale yields no validation partition.
        """
        train_stu_id, valid_stu_id, test_stu_id = SpliterUtil.divide_data_df_one_fold(
            pd.DataFrame({"stu_id:token": np.unique(df_seq['stu_id:token'])}),
            seed=self.m2c_cfg['seed'], shuffle=True,
            divide_scale_list=self.m2c_cfg['divide_scale_list']
        )

        df_train_seq = self._dict_index_flag(
            df_seq, np.isin(df_seq['stu_id:token'], train_stu_id.to_numpy().flatten()))
        df_test_seq = self._dict_index_flag(
            df_seq, np.isin(df_seq['stu_id:token'], test_stu_id.to_numpy().flatten()))
        df_valid_seq = None
        if valid_stu_id is not None:
            df_valid_seq = self._dict_index_flag(
                df_seq, np.isin(df_seq['stu_id:token'], valid_stu_id.to_numpy().flatten()))

        # BUGFIX: the caller unpacks (train, valid, test); the original returned
        # (train, test, valid), silently swapping the valid and test folds.
        return df_train_seq, df_valid_seq, df_test_seq

    def _divide_data_df_by_stu_multi_fold(self, df_seq: dict):
        """K-fold split by student id; multi-fold mode has no valid folds."""
        res = SpliterUtil.divide_data_df_multi_folds(
            pd.DataFrame({"stu_id:token": np.unique(df_seq['stu_id:token'])}),
            seed=self.m2c_cfg['seed'], shuffle=True, n_folds=self.n_folds
        )

        train_list, test_list = [], []
        for train_stu_id, test_stu_id in zip(*res):
            train_list.append(self._dict_index_flag(
                df_seq, np.isin(df_seq['stu_id:token'], train_stu_id.to_numpy().flatten())))
            test_list.append(self._dict_index_flag(
                df_seq, np.isin(df_seq['stu_id:token'], test_stu_id.to_numpy().flatten())))

        return train_list, [], test_list

    def compute_cpt2difflevel(self, **kwargs):
        """Derive per-concept easier/harder neighbour maps from the train fold.

        Concepts are ranked by mean correctness (ascending = harder first in
        ``cpt_diff``); each concept is mapped to its neighbour one step easier
        and one step harder in that ranking, with the extremes mapped to
        themselves on the missing side.
        """
        cpt_correct = defaultdict(int)
        cpt_count = defaultdict(int)
        train_fold = kwargs['df_train_folds'][0]
        mask = train_fold['mask_seq:token_seq']
        for i, (c_list, r_list) in enumerate(zip(train_fold['cpt_unfold_seq:token_seq'],
                                                 train_fold['label_seq:float_seq'])):
            valid = mask[i] == 1  # ignore padded positions
            for c, r in zip(c_list[valid], r_list[valid]):
                cpt_correct[c] += r
                cpt_count[c] += 1
        # cpt_count[c] > 0 for every key of cpt_correct, so no ZeroDivisionError.
        cpt_diff = {c: cpt_correct[c] / float(cpt_count[c]) for c in cpt_correct}
        ordered_cpts = [item[0] for item in sorted(cpt_diff.items(), key=lambda x: x[1])]

        easier_cpts, harder_cpts = defaultdict(int), defaultdict(int)
        last = len(ordered_cpts) - 1
        for index, cpt in enumerate(ordered_cpts):
            if last == 0:
                # BUGFIX: with a single concept the original indexed
                # ordered_cpts[1] and raised IndexError.
                easier_cpts[cpt] = cpt
                harder_cpts[cpt] = cpt
            elif index == 0:
                easier_cpts[cpt] = ordered_cpts[index + 1]
                harder_cpts[cpt] = cpt
            elif index == last:
                easier_cpts[cpt] = cpt
                harder_cpts[cpt] = ordered_cpts[index - 1]
            else:
                easier_cpts[cpt] = ordered_cpts[index + 1]
                harder_cpts[cpt] = ordered_cpts[index - 1]

        kwargs['easier_cpts'] = easier_cpts
        kwargs['harder_cpts'] = harder_cpts
        return kwargs

    def construct_df2dict(self, df: pd.DataFrame):
        """Group interactions per student and pad them to ``window_size``.

        ``sequence_truncation == 'recent'`` keeps the most recent interactions
        and pads on the left; otherwise the earliest interactions are kept and
        the padder's default side is used.
        """
        if df is None:
            return None

        win = self.m2c_cfg['window_size']
        if self.m2c_cfg['sequence_truncation'] == 'recent':
            truncate = lambda x: list(x)[-win:]     # keep the tail of the history
            pad_kw = {'padding': 'pre'}
        else:
            truncate = lambda x: list(x)[:win]      # keep the head of the history
            pad_kw = {}                             # padder default side

        tmp_df = df[['stu_id:token', 'exer_id:token', 'cpt_unfold:token', 'label:float']] \
            .groupby('stu_id:token').agg(truncate).reset_index()

        # mask_seq corresponds to the attention mask (1 = real token, 0 = pad).
        exer_seq, idx, mask_seq = PadSeqUtil.pad_sequence(
            tmp_df['exer_id:token'].to_list(), return_idx=True, return_mask=True,
            maxlen=win, **pad_kw
        )
        cpt_unfold_seq, _, _ = PadSeqUtil.pad_sequence(
            tmp_df['cpt_unfold:token'].to_list(), maxlen=win, **pad_kw
        )
        label_seq, _, _ = PadSeqUtil.pad_sequence(
            tmp_df['label:float'].to_list(), dtype=np.float32, maxlen=win, **pad_kw
        )

        stu_id = tmp_df['stu_id:token'].to_numpy()[idx]

        return {
            'stu_id:token': stu_id,
            'exer_seq:token_seq': exer_seq,
            'cpt_unfold_seq:token_seq': cpt_unfold_seq,
            'label_seq:float_seq': label_seq,
            'mask_seq:token_seq': mask_seq,
        }

    def set_dt_info(self, dt_info, **kwargs):
        """Fill dataset-level statistics consumed by the CL4KT model config."""
        super().set_dt_info(dt_info, **kwargs)
        dt_info['train_easier_cpts'] = kwargs['easier_cpts']
        # BUGFIX: the original assigned kwargs['easier_cpts'] here too, so the
        # model saw identical easier/harder maps.
        dt_info['train_harder_cpts'] = kwargs['harder_cpts']

        if not self.is_dataset_divided:
            # ids are assumed to be contiguous and zero-based, so max + 1 = count.
            if 'stu_id:token' in kwargs['df'].columns:
                dt_info['stu_count'] = int(kwargs['df']['stu_id:token'].max() + 1)
            if 'exer_id:token' in kwargs['df'].columns:
                dt_info['exer_count'] = int(kwargs['df']['exer_id:token'].max() + 1)
        else:
            stu_count = max(kwargs['df_train']['stu_id:token'].max() + 1,
                            kwargs['df_test']['stu_id:token'].max() + 1)
            if 'df_valid' in kwargs:
                stu_count = max(kwargs['df_valid']['stu_id:token'].max() + 1, stu_count)

            exer_count = max(kwargs['df_train']['exer_id:token'].max() + 1,
                             kwargs['df_test']['exer_id:token'].max() + 1)
            if 'df_valid' in kwargs:
                exer_count = max(kwargs['df_valid']['exer_id:token'].max() + 1, exer_count)

            dt_info['stu_count'] = stu_count
            dt_info['exer_count'] = exer_count

        if kwargs.get('df_exer', None) is not None:
            if 'cpt_seq:token_seq' in kwargs['df_exer']:
                dt_info['cpt_count'] = len(set(chain(*kwargs['df_exer']['cpt_seq:token_seq'].to_list())))
@yujiebing
Copy link
Author

经同学提醒,发现前面这个问题已经解决了,但是不知道是不是同样的修改思路,期待您的回复!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant