from transformers import AlbertTokenizer, AlbertModel, AlbertForMaskedLM,AlbertConfig, AlbertForPreTraining
import torch
# Locations of the pretrained ALBERT artefacts, relative to the working directory.
CONFIG_PATH = 'model/config.json'
WEIGHTS_PATH = 'model/pytorch_model.bin'
SENTENCEPIECE_PATH = "model/sentencepiece.model"

# Tokenizer backed directly by the SentencePiece model file.
# NOTE(review): AlbertTokenizer defaults to keep_accents=False, which applies
# NFKD normalisation and drops combining marks; for Japanese text this may
# strip dakuten/handakuten. Confirm this matches how the vocabulary was built.
tokenizer = AlbertTokenizer(SENTENCEPIECE_PATH)

# Architecture hyper-parameters come from the JSON config; the weights are then
# loaded into the pre-training variant of ALBERT using that config.
config = AlbertConfig.from_json_file(CONFIG_PATH)
model = AlbertForPreTraining.from_pretrained(WEIGHTS_PATH, config=config)
import pandas as pd
# Build a sentence-order-prediction (SOP) dataset from sentence pairs.
#
# Each pair read from data/pair.csv is kept in its stored order with
# sop_label 0, and is also emitted with the two sentences swapped and
# sop_label 1 (the convention used by AlbertForPreTraining's
# sentence_order_label: 0 = original order, 1 = switched order).
df = pd.read_csv(
    "data/pair.csv",
    names=["sentence1", "sentence2"],
    header=None,
)
# Drop the first row (presumably the file's own header line - confirm against
# the data) and take an explicit copy so the column assignment below never
# operates on a view of the raw frame.
df = df.iloc[1:].copy()
df["sop_label"] = 0

# Swapped-order counterpart of every pair; it shares df's row index.
df1 = pd.DataFrame(
    {
        "sentence1": df["sentence2"],
        "sentence2": df["sentence1"],
        "sop_label": 1,
    }
)

# Combine both orderings, shuffle the rows (unseeded, so the order differs
# between runs), then discard any pair with a missing sentence.
df2 = pd.concat([df, df1])
df2 = df2.sample(frac=1)
df2 = df2.dropna()

# The row index is written as the first (unnamed) column because `index` is
# left at its default; each index value appears once per ordering.
df2.to_csv("sop_sen_pair.csv")
from transformers import DataCollatorForLanguageModeling

# Masked-language-modelling collator: selects 15% of tokens for masking when
# batching. It is constructed once; the original script built an identical
# collator twice.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Smoke checks of sentence-pair tokenisation. The results are bound to names so
# they can be inspected instead of being silently discarded.
# Batch of two pairs, each padded to exactly 100 positions.
sample_batch_encoding = tokenizer(
    [
        ("<目的>XXの満開50日後の果実肥大調査。", "平年並み-やや大きい。"),
        ("営農センターに在庫があるので、出荷の際とりにくるとのこと。", "水稲肥料の一発かんたくんを6袋受注する。"),
    ],
    padding="max_length",
    max_length=100,
)

# Token strings for a single encoded pair, useful for eyeballing the segmentation.
sample_pair_tokens = tokenizer.convert_ids_to_tokens(
    tokenizer("<目的>ももの満開50日後の果実肥大調査。", "平年並み-やや大きい。")["input_ids"]
)