Load the text data.
import pandas as pd

def load_data(filepath):
    """Read the raw text data from a CSV file into a DataFrame."""
    df = pd.read_csv(filepath)
    return df
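A quick check of the loaded frame; the path here is hypothetical, and the column names follow the ones used in the rest of this section:

df = load_data("data/train.csv")  # hypothetical path
print(df[["newsTitle", "newsContent", "label"]].head())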
Before wrapping the data in a torch Dataset class, it has to be tokenized with the tokenizer.
def construct_tokenized_dataset(dataset, tokenizer, max_length):
    """Tokenize each sample in the form [news title + [SEP] + news body]."""
    # dataset: pd.DataFrame
    concat_entity = []
    for title, body in zip(dataset["newsTitle"], dataset["newsContent"]):
        total = str(title) + "[SEP]" + str(body)
        concat_entity.append(total)
    tokenized_sentences = tokenizer(
        concat_entity,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
        add_special_tokens=True,
        return_token_type_ids=False,  # models after BERT (e.g. RoBERTa) do not use token_type_ids
    )
    return tokenized_sentences
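A minimal sketch of what the tokenizer returns, assuming a klue/bert-base checkpoint (any BERT-style tokenizer with a [SEP] token behaves the same way); the sample rows are made up:

from transformers import AutoTokenizer
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")  # assumed checkpoint
sample = pd.DataFrame(
    {"newsTitle": ["Sample title"], "newsContent": ["Sample body text."]}
)
encoded = construct_tokenized_dataset(sample, tokenizer, max_length=64)
print(encoded.keys())              # dict_keys(['input_ids', 'attention_mask'])
print(encoded["input_ids"].shape)  # torch.Size([1, sequence_length])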
Define a custom dataset class by inheriting from torch.utils.data.Dataset. In __getitem__, note that self.dataset (the tokenizer's encoding output) is dictionary-like.
import torch

class news_dataset(torch.utils.data.Dataset):
    """Wrap the tokenized encodings and labels as a torch Dataset."""

    def __init__(self, news_dataset, labels):
        self.dataset = news_dataset  # the tokenized_sentences returned above
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            # copy only the idx-th input_ids and attention_mask into the item dict
            key: val[idx].clone().detach() for key, val in self.dataset.items()
        }
        item['labels'] = torch.tensor(self.labels[idx])
        '''
        item = {
            'input_ids': tensor([token indices]),
            'attention_mask': tensor([1, ..., 0]),
            'labels': label value
        }
        '''
        return item

    def __len__(self):
        return len(self.labels)
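A short usage sketch, reusing the encoded output from the tokenizer sketch above (the single toy label and batch size are assumptions), showing that the dataset plugs straight into a DataLoader:

from torch.utils.data import DataLoader

toy_dataset = news_dataset(encoded, labels=[0])       # one toy sample with label 0
loader = DataLoader(toy_dataset, batch_size=1, shuffle=True)
batch = next(iter(loader))
print(batch["input_ids"].shape, batch["labels"])      # (1, sequence_length), tensor([0])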
prepare_dataset function: load → split → tokenize → Dataset
from sklearn.model_selection import train_test_split

def prepare_dataset(dataset_dir, tokenizer, max_length):
    # load_data
    train_df = load_data(...)
    test_df = load_data(...)
    # train-val-test split
    train_split, val_split = train_test_split(train_df, test_size=0.2, random_state=...)
    # split labels
    train_labels = train_split['label'].values
    val_labels = val_split['label'].values
    test_labels = test_df['label'].values
    # tokenize
    tokenized_train = construct_tokenized_dataset(train_split, tokenizer, max_length)
    tokenized_val = construct_tokenized_dataset(val_split, tokenizer, max_length)
    tokenized_test = construct_tokenized_dataset(test_df, tokenizer, max_length)
    # Dataset
    train_dataset = news_dataset(tokenized_train, train_labels)
    val_dataset = news_dataset(tokenized_val, val_labels)
    test_dataset = news_dataset(tokenized_test, test_labels)
    return train_dataset, val_dataset, test_dataset
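Putting the pipeline together, reusing the tokenizer from the sketch above; the directory and max length are assumptions, and the call only runs once the elided file paths inside prepare_dataset are filled in:

train_dataset, val_dataset, test_dataset = prepare_dataset(
    dataset_dir="./data",   # hypothetical directory holding the train/test CSV files
    tokenizer=tokenizer,    # tokenizer from the sketch above
    max_length=256,         # assumed maximum sequence length
)
print(len(train_dataset), len(val_dataset), len(test_dataset))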
load tokenizer and model
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
)

def load_tokenizer_and_model_for_train():
    """Load a pretrained tokenizer and model from Hugging Face for training."""
    # load model and tokenizer
    MODEL_NAME = args.model_name  # args: script-level arguments (e.g., from argparse)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # set the model hyperparameters
    model_config = AutoConfig.from_pretrained(MODEL_NAME)  # load the model config
    model_config.num_labels = 2  # binary classification
    print(model_config)
    model = AutoModelForSequenceClassification.from_pretrained(  # AutoModel head matching the task
        MODEL_NAME, config=model_config
    )
    print("--- Modeling Done ---")
    return tokenizer, model
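A usage sketch, assuming args is an argparse-style namespace and klue/bert-base as the checkpoint (both assumptions); batch_size is included here because the inference function below reads it:

from types import SimpleNamespace
import torch

# hypothetical stand-in for the argparse namespace used throughout the script
args = SimpleNamespace(model_name="klue/bert-base", batch_size=32)
tokenizer, model = load_tokenizer_and_model_for_train()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)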
inference
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm

def inference(model, tokenized_sent, device):
    """Run inference over the given dataset with the trained model."""
    dataloader = DataLoader(tokenized_sent, batch_size=args.batch_size, shuffle=False)
    model.eval()
    output_pred = []
    for i, data in enumerate(tqdm(dataloader)):
        with torch.no_grad():
            outputs = model(
                input_ids=data["input_ids"].to(device),            # pass the token indices to the model
                attention_mask=data["attention_mask"].to(device),  # pass the attention mask
            )
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()  # np.ndarray of shape (batch_size, num_labels)
        result = np.argmax(logits, axis=-1)     # predicted class index per sample
        output_pred.append(result)
    return np.concatenate(output_pred).tolist()
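A final usage sketch that runs the model over the test split, reusing model, device, args, and test_dataset from the sketches above; in practice the model would be fine-tuned before this step, and the output path is hypothetical:

import pandas as pd

preds = inference(model, test_dataset, device)  # list of predicted class indices
pd.DataFrame({"pred": preds}).to_csv("predictions.csv", index=False)  # hypothetical output file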