My sister usually plays tennis ( ) Saturdays.
1. by 2. on 3. with 4. at
Bob ( ) five friends to his party.
1. made 2. visited 3. invited 4. spoke
!pip install openpyxl
from transformers import pipeline
# The fill-mask pipeline fills in a blanked-out (masked) word in a sentence.
# https://huggingface.co/transformers/main_classes/pipelines.html#transformers.FillMaskPipeline
model = pipeline("fill-mask")
# Quick sanity check: ask the pipeline to fill the mask in an example sentence.
res = model(f"HuggingFace is creating a {model.tokenizer.mask_token} that the community uses to solve NLP tasks.")
res
# Notice that the matched words have a space at the beginning.
# This distinguishes them from suffix tokens, which attach to the previous word without a space.
Downloading: 0%| | 0.00/480 [00:00<?, ?B/s]
Downloading: 0%| | 0.00/331M [00:00<?, ?B/s]
Downloading: 0%| | 0.00/899k [00:00<?, ?B/s]
Downloading: 0%| | 0.00/456k [00:00<?, ?B/s]
Downloading: 0%| | 0.00/1.36M [00:00<?, ?B/s]
[{'sequence': 'HuggingFace is creating a tool that the community uses to solve NLP tasks.',
'score': 0.17927570641040802,
'token': 3944,
'token_str': ' tool'},
{'sequence': 'HuggingFace is creating a framework that the community uses to solve NLP tasks.',
'score': 0.11349428445100784,
'token': 7208,
'token_str': ' framework'},
{'sequence': 'HuggingFace is creating a library that the community uses to solve NLP tasks.',
'score': 0.05243517830967903,
'token': 5560,
'token_str': ' library'},
{'sequence': 'HuggingFace is creating a database that the community uses to solve NLP tasks.',
'score': 0.034935519099235535,
'token': 8503,
'token_str': ' database'},
{'sequence': 'HuggingFace is creating a prototype that the community uses to solve NLP tasks.',
'score': 0.028602516278624535,
'token': 17715,
'token_str': ' prototype'}]
# Check the pipeline call signature.
help(model.__call__)
# Notice that the call accepts extra options:
#   targets: the candidate words to fill the blank with
#   top_k : the number of candidates to return
__call__(*args, targets=None, top_k: Union[int, NoneType] = None, **kwargs) method of transformers.pipelines.fill_mask.FillMaskPipeline instance
Fill the masked token in the text(s) given as inputs.
Args:
args (:obj:`str` or :obj:`List[str]`):
One or several texts (or one list of prompts) with masked tokens.
targets (:obj:`str` or :obj:`List[str]`, `optional`):
When passed, the model will return the scores for the passed token or tokens rather than the top k
predictions in the entire vocabulary. If the provided targets are not in the model vocab, they will be
tokenized and the first resulting token will be used (with a warning).
top_k (:obj:`int`, `optional`):
When passed, overrides the number of predictions to return.
Return:
A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the following keys:
- **sequence** (:obj:`str`) -- The corresponding input with the mask token prediction.
- **score** (:obj:`float`) -- The corresponding probability.
- **token** (:obj:`int`) -- The predicted token id (to replace the masked one).
- **token_str** (:obj:`str`) -- The predicted token (to replace the masked one).
# Check which pretrained model the default fill-mask pipeline loaded.
model.model
from collections import namedtuple
# A Problem is one multiple-choice fill-in-the-blank question:
#   text    -- the sentence, with "{}" marking the blank
#   choices -- the candidate words
#   answer  -- the correct choice (always one of `choices`)
Problem = namedtuple("Problem", "text choices answer")
# Eiken grade 5
# source: https://www.eiken.or.jp/eiken/exam/grade_5/pdf/202101/2021-1-1ji-5kyu.pdf
eiken5 = [
    Problem("A: What is your {}? B: Kazumi Suzuki.",
            ["hour", "club", "date", "name"], "name")
    ,Problem("I know Judy. She can {} French very well.",
            ["see", "drink", "speak", "open"], "speak")
    ,Problem("A: Are your baseball shoes in your room, Mike? B: No, Mom. They're in my {} at school.",
            ["window", "shop", "locker", "door"], "locker")
    # typo fixed: "Mysister" -> "My sister" (matches the original question text)
    ,Problem("My sister usually plays tennis {} Saturdays.",
            ["by", "on", "with", "at"], "on")
    ,Problem("My mother likes {}. She has many pretty ones in the garden.",
            ["sports", "movies", "schools", "flowers"], "flowers")
    ,Problem("Let's begin today's class. Open your textbooks to {} 22.",
            ["chalk", "ground", "page", "minute"], "page")
    ,Problem("Today is Wednesday. Tomorrow is {}.",
            ["Monday", "Tuesday", "Thursday", "Friday"], "Thursday")
    ,Problem("I usually read magazines {} home.",
            ["of", "on", "with", "at"], "at")
    ,Problem("A: It's ten o'clock, Jimmy. {} to bed. B: All right, Mom.",
            ["Go", "Sleep", "Do", "Sit"], "Go")
    ,Problem("A: Do you live {} Tokyo? B: Yes. It's a big city.",
            ["after", "with", "on", "in"], "in")
]
# These are grade-5 questions (Grade 1 is the highest)
eiken5
# Masked Language Model without choices
# we consider the BERT model's guess "correct" if the correct answer is in the top 5 candidates
import pandas as pd
out = pd.DataFrame() # we will add outcomes in the dataframe
def solve_without_choices(problems, top_k=5):
    """Ask the fill-mask model for its top_k guesses without revealing the choices.

    Returns a DataFrame with the problem text, the suggested words with their
    scores, the rank of the correct answer (-1 if absent), and a `correct` flag.
    """
    masked = [p.text.format(model.tokenizer.mask_token) for p in problems]
    predictions = model(masked, top_k=top_k)
    rows = []
    for prob, cands in zip(problems, predictions):
        # suggested words (stripped of the leading space) and their scores
        words = [c["token_str"].strip() for c in cands]
        probs = [c["score"] for c in cands]
        shown = ",".join("%s(%.3f)" % pair for pair in zip(words, probs))
        # 1-based rank of the correct answer among the suggestions; -1 if missing
        rank = words.index(prob.answer) + 1 if prob.answer in words else -1
        rows.append((prob.text, shown, rank))
    frame = pd.DataFrame(rows, columns=["problem", "scores", "answer_position"])
    frame["correct"] = frame["answer_position"] > 0
    return frame
# Grade 5, no-target variant: tag the result and append it to the running table.
o = solve_without_choices(eiken5)
o["grade"] = "5"
o["method"] = "fill-mask-no-target"
out = pd.concat([out, o])
o
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
# Masked Language Model with candidate words
# In this case, the model's guess is considered correct if the correct one is the first guess.
def solve_with_choices(problems):
    """Score each problem's own choices with the fill-mask model.

    The guess counts as correct only when the true answer ranks first.
    """
    rows = []
    for prob in problems:
        masked = prob.text.format(model.tokenizer.mask_token)
        # the leading space keeps candidates from being matched as suffix pieces
        candidates = [" " + w for w in prob.choices]
        preds = model(masked, targets=candidates)
        ranked = [s["token_str"].strip() for s in preds]
        scores = [s["score"] for s in preds]
        shown = ",".join("%s(%.3f)" % pair for pair in zip(ranked, scores))
        # 1-based rank of the correct answer; -1 if it was dropped from the ranking
        rank = ranked.index(prob.answer) + 1 if prob.answer in ranked else -1
        rows.append((prob.text, shown, rank))
    frame = pd.DataFrame(rows, columns=["problem", "scores", "answer_position"])
    frame["correct"] = frame.answer_position == 1
    return frame
# Grade 5, with-targets variant: tag the result and append it to the running table.
o = solve_with_choices(eiken5)
o["grade"] = "5"
o["method"] = "fill-mask-with-targets"
out = pd.concat([out, o])
o
# The only failure by BERT is about the day of week:
# Today is Wednesday. Tomorrow is ( ).
#
# The model guessed Friday rather than Thursday. Close!
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
# Apply the methods to other grades
# Eiken grade 4
# source: https://www.eiken.or.jp/eiken/exam/grade_4/pdf/202101/2021-1-1ji-4kyu.pdf
eiken4 = [
    Problem("My father is a {} of a sports club. He plays tennis there every Wednesday night.",
            ["festival", "picnic", "member", "group"], "member")
    # typo fixed: "intesting" -> "interesting"
    ,Problem("Mr. Clark told us many interesting {} about his trip to India.",
            ["pictures", "books", "stories", "magazines"], "stories")
    ,Problem("It's snowing a lot today, so please drive {}.",
            ["slowly", "freely", "coldly", "busily"], "slowly")
    ,Problem("In spring, Jane likes to walk in her grandmother's {}. She enjoys looking at the beautiful flowers there.",
            ["stone", "sky", "garden", "wall"], "garden")
    ,Problem("Many girls in my class have {} hair.",
            ["late", "slow", "short", "busy"], "short")
    ,Problem("A: Do you live in a city? B: No. I live in a small {}",
            ["hobby", "ticket", "town", "holiday"], "town")
    ,Problem("I {} Nancy's notebook. It was on Mary's desk",
            ["stayed", "found", "stopped", "went"], "found")
    ,Problem("Dennis went to Japan for a year in August. He was sad when he {} goodbye to his family",
            ["ended", "hoped", "told", "said"], "said")
    ,Problem("Jeff left the party at 8:00. He wanted to {} home early and go to bed.",
            ["meet", "put", "send", "get"], "get")
    ,Problem("Mom's lemon cake is not as good {} her chocolate cake.",
            ["to", "of", "as", "by"], "as")
]
# Grade 4: run both fill-mask variants and append each result to the running table.
for solver, tag in ((solve_without_choices, "fill-mask-no-target"),
                    (solve_with_choices, "fill-mask-with-targets")):
    o = solver(eiken4)
    display(o)
    o["grade"] = "4"
    o["method"] = tag
    out = pd.concat([out, o])
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
The specified target token ` coldly` does not exist in the model vocabulary. Replacing with `Ġcold`.
The specified target token ` busily` does not exist in the model vocabulary. Replacing with `Ġbus`.
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
# Eiken grade 3
# source: https://www.eiken.or.jp/eiken/exam/grade_3/pdf/202101/2021-1-1ji-3kyu.pdf
eiken3 = [
    Problem("A: How do you make that potato dish? B: First, you {} the potatoes, and then cut them in half and put butter on them.",
            ["boil", "care", "hurt", "eat"], "boil")
    ,Problem("Last summer, Hiroshi's family traveled around Japan. This year they went to go {}.",
            ["abroad", "inside", "other", "similar"], "abroad")
    ,Problem("Bob {} five friends to his party",
            ["made", "visited", "invited", "spoke"], "invited")
    ,Problem("A: John, you should go to bed soon. If you stay up too late, you'll {} and be late for school. B: OK, Mom.",
            ["graduate", "promise", "return", "oversleep"], "oversleep")
    ,Problem("A: Did you buy your father something special for his birthday? B: Yes. He loves to cook, so I got him a new {}.",
            ["apron", "ring", "contact", "field"], "apron")
    # typo fixed: "wong" -> "wrong"
    ,Problem("I bought a new T-shirt for my brother, but I bought the wrong size. It was too {} for him.",
            ["heavy", "clear", "tight", "bright"], "tight")
    ,Problem("Sarah saw some flowers by the road while she was taking a walk. She {} a few and took them home.",
            ["spent", "wished", "picked", "guessed"], "picked")
    ,Problem("Jenny saw her grandparents {} the first time in years. She missed them very much.",
            ["for", "from", "out", "over"], "for")
    ,Problem("A: I told my mother that I would be home by 7:00. I don't want to {} my promise, so I have to go now. B: OK.",
            ["pass", "sell", "break", "lend"], "break")
    # typo fixed: "Don'y" -> "Don't"
    ,Problem("A: Don't say anything to Dad about the surprise party! B: Don't worry. He won't find {} about it from me.",
            ["within", "through", "out", "near"], "out")
]
# Grade 3: run both fill-mask variants and append each result to the running table.
for solver, tag in ((solve_without_choices, "fill-mask-no-target"),
                    (solve_with_choices, "fill-mask-with-targets")):
    o = solver(eiken3)
    display(o)
    o["grade"] = "3"
    o["method"] = tag
    out = pd.concat([out, o])
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
The specified target token ` oversleep` does not exist in the model vocabulary. Replacing with `Ġovers`.
The specified target token ` apron` does not exist in the model vocabulary. Replacing with `Ġa`.
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
# Eiken grade pre2
# source: https://www.eiken.or.jp/eiken/exam/grade_p2/pdf/202101/2021-1-1ji-p2kyu.pdf
eikenp2 = [
    Problem("Jamie visited several {} buildings when he went to Rome. Some of them were more than 2,000 years old.",
            ["ancient", "exact", "responsible", "unable"], "ancient")
    ,Problem("Sally's French teacher told her to read an article in a French newspaper and {} it into English.",
            ["guide", "throw", "control", "translate"], "translate")
    ,Problem("Henry likes living in the city because there are so many things to do there. But he also loves nature, so sometimes he goes for a drive in the {}.",
            ["decision", "experiment", "countryside", "image"], "countryside")
    ,Problem("A: Is it true that the things in this store only cost 100 yen? B: Yes, but you will also need to pay {}, so they actually cost a little more.",
            ["tax", "data", "total", "waste"], "tax")
    # typo fixed: "bust" -> "bus"
    ,Problem("When the bus was an hour late, one man shouted {} at the driver. He said that he had missed an important meeting.",
            ["partly", "angrily", "secretly", "tightly"], "angrily")
    ,Problem("Firefighters have to {} people from buildings that are on fire. To do this, they must be strong and healthy.",
            ["weigh", "produce", "stamp", "rescue"], "rescue")
    # typo fixed: "since here last CD" -> "since her last CD"
    ,Problem("John loves the singer Ann May, and he cannot wait until her new CD is {} next week. He has been waiting for it since her last CD came out two years ago.",
            ["released", "trapped", "divided", "invented"], "released")
    ,Problem("The news that Ms. Kelly, the art teacher, was going to get married {} through the school very quickly. By lunchtime, almost all the students knew about it.",
            ["spread", "served", "stretched", "stood"], "spread")
    # typo fixed: "as soon a syou" -> "as soon as you"
    ,Problem("A: I'm really nervous about acting in the play next week. B: I know you're worried now, but you'll feel fine as soon as you get on the {}.",
            ["stage", "field", "court", "screen"], "stage")
    # typo fixed: "intested" -> "interested"
    ,Problem("Before Diane attended Professor Miller's {} at the university, she was not interested in Chinese art. However, now, she wants to learn more about it.",
            ["comment", "shipment", "lecture", "furniture"], "lecture")
]
# Grade pre2: run both fill-mask variants and append each result to the running table.
for solver, tag in ((solve_without_choices, "fill-mask-no-target"),
                    (solve_with_choices, "fill-mask-with-targets")):
    o = solver(eikenp2)
    display(o)
    o["grade"] = "pre2"
    o["method"] = tag
    out = pd.concat([out, o])
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
# Eiken grade 2
# source: https://www.eiken.or.jp/eiken/exam/grade_2/pdf/202101/2021-1-1ji-2kyu.pdf
eiken2 = [
    Problem(("At first, the marketing department and the sales department were on the project together. "
             "But people in the sales department were too busy, so now the project is being run {} by the marketing department."),
            ["needlessly", "entirely", "scientifically", "violently"], "entirely")
    # typo fixed: "paitings" -> "paintings"
    ,Problem("Experts at the art gallery discovered that one of their paintings, which they had thought was a {} Picasso, was actually just a copy.",
            ["genuine", "severe", "logical", "portable"], "genuine")
    ,Problem("The musician Jimmy Baker had a lot of {} when he was a child. His family was very poor before he became a rich and famous rock star.",
            ["permission", "membership", "concentration", "hardship"], "hardship")
    ,Problem("Mother Teresa helped many sick people and gave food to many hungry children in India. She was known as a person who cared about {}.",
            ["generation", "gravity", "hesitation", "humanity"], "humanity")
    # typo fixed: "waled" -> "walked"
    ,Problem("As Liam walked down the dark street, he began to feel afraid. He had the {} that someone was watching him.",
            ["feature", "translation", "sensation", "property"], "sensation")
    ,Problem("Risa buys water that comes from a mountain stream. She says that drinking it is good because it has many {} that her body needs.",
            ["campaigns", "operations", "illustrations", "minerals"], "minerals")
    ,Problem("The lifeguard ran into the ocean to help a young girl who looked like she was {} in the big waves",
            ["proposing", "converting", "drowning", "exporting"], "drowning")
    ,Problem("Yesterday was a hot day at the zoo, so Heather bought an ice cream. It melted so quickly that she could not help {} some on her dress.",
            ["arguing", "spilling", "convincing", "maintaining"], "spilling")
    ,Problem("In the past, sailors had to use the stars to {} when they were on an ocean. These days, ships have modern equipment that shows sailors which way to go.",
            ["satisfy", "respect", "permit", "navigate"], "navigate")
    ,Problem("Daisuke's grandmother eats a lot of vegetables, drinks green tea, and goes for a long walk every evening to {} her health.",
            ["interpret", "replace", "preserve", "betray"], "preserve")
]
# Grade 2: run both fill-mask variants and append each result to the running table.
for solver, tag in ((solve_without_choices, "fill-mask-no-target"),
                    (solve_with_choices, "fill-mask-with-targets")):
    o = solver(eiken2)
    display(o)
    o["grade"] = "2"
    o["method"] = tag
    out = pd.concat([out, o])
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
The specified target token ` needlessly` does not exist in the model vocabulary. Replacing with `Ġneed`.
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
# Eiken grade pre1
# source: https://www.eiken.or.jp/eiken/exam/grade_p1/pdf/202101/2021-1-1ji-p1kyu.pdf
eikenp1 = [
    Problem(("A: Thanks for showing me the outline of your sales presentation. It's good, but it's a bit {} in some places. "
             "B: I guess I do repeat some information too much. I'll try to take some of it out."),
            ["decisive", "subjective", "redundant", "distinct"], "redundant")
    ,Problem("Lisa went to the interview even though she thought there was a low {} of her getting the job. As she expected, she was not hired.",
            ["restoration", "credibility", "contention", "probability"], "probability")
    # typos fixed: "counties" -> "countries", "nutritous" -> "nutritious", "do no not" -> "do not"
    ,Problem("It is sadly {} that, in developing countries, many of the farmers who grow nutritious crops for export do not have enough food to feed their own families.",
            ["indefinite", "ironic", "restless", "superficial"], "ironic")
    ,Problem("The explosion at the chemical factory {} great damage on the local environment. It will take years for wildlife to fully recover in the region.",
            ["inflicted", "enhanced", "vanished", "perceived"], "inflicted")
    ,Problem("Some say the best way to overcome a {} is to expose oneself to what one fears. For example, people who are afraid of mice should try holding one.",
            ["temptation", "barricade", "phobia", "famine"], "phobia")
    ,Problem("English classes at the university were required, but students were {} from them if they could prove they had advanced ability in the language.",
            ["exempted", "prosecuted", "commanded", "qualified"], "exempted")
    ,Problem("E-mail and text messaging have {} the way people write. Many people shorten words and ignore traditional rules of grammar.",
            ["transformed", "officiated", "synthesized", "disarmed"], "transformed")
    ,Problem(("Some analysts think the new treaty on CO2 emissions is a {} in the fight against global warming. "
              '"This is the most important environmental treaty ever signed," said one.'),
            ["milestone", "vigor", "backlog", "confession"], "milestone")
    ,Problem("Lying on the sunny beach with her husband on their vacation, Roberta felt {} happy. She had never been so content.",
            ["barely", "profoundly", "improperly", "harshly"], "profoundly")
    # typo fixed: "evey" -> "every"
    ,Problem("Nadine spends an hour thoroughly cleaning her apartment every day, so the entire place is {}.",
            ["spotless", "minute", "rugged", "impartial"], "spotless")
]
# Grade pre1: run both fill-mask variants and append each result to the running table.
for solver, tag in ((solve_without_choices, "fill-mask-no-target"),
                    (solve_with_choices, "fill-mask-with-targets")):
    o = solver(eikenp1)
    display(o)
    o["grade"] = "pre1"
    o["method"] = tag
    out = pd.concat([out, o])
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
The specified target token ` barricade` does not exist in the model vocabulary. Replacing with `Ġbarric`.
The specified target token ` phobia` does not exist in the model vocabulary. Replacing with `Ġph`.
The specified target token ` officiated` does not exist in the model vocabulary. Replacing with `Ġoffic`.
The specified target token ` synthesized` does not exist in the model vocabulary. Replacing with `Ġsynthes`.
The specified target token ` disarmed` does not exist in the model vocabulary. Replacing with `Ġdis`.
The specified target token ` vigor` does not exist in the model vocabulary. Replacing with `Ġvig`.
The specified target token ` spotless` does not exist in the model vocabulary. Replacing with `Ġspot`.
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
# Eiken grade 1
# source: https://www.eiken.or.jp/eiken/exam/grade_1/pdf/202101/2021-1-1ji-1kyu.pdf
eiken1 = [
    # typo fixed: "perople" -> "people"
    Problem("Cell phones have become a permanent {} in modern society. Most people could not imagine living without one.",
            ["clasp", "stint", "fixture", "rupture"], "fixture")
    # typo fixed: "onece" -> "once"
    ,Problem("Colin did not have enough money to pay for the car all at once, so he paid it off in {} of $800 a month for two years.",
            ["dispositions", "installments", "enactments", "speculations"], "installments")
    ,Problem("When she asked her boss for a raise, Melanie's {} tone of voice made it obvious how nervous she was.",
            ["garish", "jovial", "pompous", "diffident"], "diffident")
    ,Problem("The religious sect established a {} in a rural area where its followers could live together and share everything. No private property was allowed.",
            ["dirge", "prelude", "repository", "commune"], "commune")
    ,Problem("The famous reporter was fired for {} another journalist's work. His article was almost exactly the same as that of the other journalist.",
            ["alleviating", "plagiarizing", "inoculating", "beleaguering"], "plagiarizing")
    ,Problem("Now that the local steel factory has closed down, the streets of the once-busy town are lined with {} businesses. Most owners have abandoned their stores.",
            ["rhetorical", "volatile", "defunct", "aspiring"], "defunct")
    ,Problem("The ambassador's failure to attend the ceremony held in honor of the king was considered an {} by his host nation and made already bad relations worse.",
            ["elucidation", "affront", "impasse", "ultimatum"], "affront")
    ,Problem("US border guards managed to {} the escaped prisoner as he tried to cross into Canada. He was returned to jail immediately.",
            ["apprehend", "pillage", "exalt", "acclimate"], "apprehend")
    ,Problem("Anthony enjoyed his first day at his new job. The atmosphere was {}, and his colleagues did their best to make him feel welcome.",
            ["congenial", "delirious", "measly", "implausible"], "congenial")
    # fixed: missing space between the two concatenated string pieces ("more.B:" -> "more. B:")
    ,Problem(("A: I just learned I've been {} to second violin in the school orchestra. I knew I should've practiced more. "
              "B: Well, if you work hard, I'm sure you can get your previous position back."),
            ["relegated", "jeopardized", "reiterated", "stowed"], "relegated")
]
# Grade 1: run both fill-mask variants and append each result to the running table.
for solver, tag in ((solve_without_choices, "fill-mask-no-target"),
                    (solve_with_choices, "fill-mask-with-targets")):
    o = solver(eiken1)
    display(o)
    o["grade"] = "1"
    o["method"] = tag
    out = pd.concat([out, o])
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
The specified target token ` dispositions` does not exist in the model vocabulary. Replacing with `Ġdispos`.
The specified target token ` enactments` does not exist in the model vocabulary. Replacing with `Ġenact`.
The specified target token ` speculations` does not exist in the model vocabulary. Replacing with `Ġspec`.
The specified target token ` garish` does not exist in the model vocabulary. Replacing with `Ġgar`.
The specified target token ` jovial` does not exist in the model vocabulary. Replacing with `Ġj`.
The specified target token ` pompous` does not exist in the model vocabulary. Replacing with `Ġpomp`.
The specified target token ` diffident` does not exist in the model vocabulary. Replacing with `Ġdiff`.
The specified target token ` dirge` does not exist in the model vocabulary. Replacing with `Ġdir`.
The specified target token ` prelude` does not exist in the model vocabulary. Replacing with `Ġpre`.
The specified target token ` commune` does not exist in the model vocabulary. Replacing with `Ġcommun`.
The specified target token ` alleviating` does not exist in the model vocabulary. Replacing with `Ġallev`.
The specified target token ` plagiarizing` does not exist in the model vocabulary. Replacing with `Ġplagiar`.
The specified target token ` inoculating` does not exist in the model vocabulary. Replacing with `Ġinoc`.
The specified target token ` beleaguering` does not exist in the model vocabulary. Replacing with `Ġbe`.
The specified target token ` elucidation` does not exist in the model vocabulary. Replacing with `Ġeluc`.
The specified target token ` affront` does not exist in the model vocabulary. Replacing with `Ġaff`.
The specified target token ` impasse` does not exist in the model vocabulary. Replacing with `Ġimp`.
The specified target token ` ultimatum` does not exist in the model vocabulary. Replacing with `Ġult`.
The specified target token ` pillage` does not exist in the model vocabulary. Replacing with `Ġpill`.
The specified target token ` exalt` does not exist in the model vocabulary. Replacing with `Ġex`.
The specified target token ` acclimate` does not exist in the model vocabulary. Replacing with `Ġacc`.
The specified target token ` congenial` does not exist in the model vocabulary. Replacing with `Ġcongen`.
The specified target token ` delirious` does not exist in the model vocabulary. Replacing with `Ġdel`.
The specified target token ` measly` does not exist in the model vocabulary. Replacing with `Ġmeas`.
The specified target token ` implausible` does not exist in the model vocabulary. Replacing with `Ġimpl`.
The specified target token ` jeopardized` does not exist in the model vocabulary. Replacing with `Ġjeopard`.
The specified target token ` stowed` does not exist in the model vocabulary. Replacing with `Ġst`.
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
# As the grade gets higher, we are more likely to encounter "unknown" words (words not included in the vocabulary),
# and the fill-mask approach fails to make a guess for them.
#
# To overcome this, we next employ a language model that calculates sentence perplexity scores.
# Reference: https://huggingface.co/transformers/perplexity.html
# https://discuss.huggingface.co/t/gpt-2-perplexity-score-normalized-on-sentence-lenght/5205
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
# Fall back to CPU when no GPU is present so the notebook still runs;
# the original hard-coded "cuda" and crashed on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "gpt2-large"
# GPT-2 large: used below to score candidate sentences by language-model loss.
model2 = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
Downloading: 0%| | 0.00/3.25G [00:00<?, ?B/s]
Downloading: 0%| | 0.00/1.04M [00:00<?, ?B/s]
Downloading: 0%| | 0.00/456k [00:00<?, ?B/s]
Downloading: 0%| | 0.00/1.36M [00:00<?, ?B/s]
def solve_with_choices2(problems):
    """Pick, for each problem, the choice whose completed sentence gets the
    lowest GPT-2 language-model loss (i.e. the lowest perplexity).

    Returns a DataFrame with the scores of every choice (sorted best first),
    the rank of the correct answer, and a `correct` flag (rank == 1).
    """
    rows = []
    for prob in problems:
        losses = []
        for choice in prob.choices:
            enc = tokenizer(prob.text.format(choice), return_tensors='pt')
            ids = enc.input_ids.to(device)
            # no gradients needed: we only read the loss value
            with torch.no_grad():
                losses.append(model2(ids, labels=ids)[0].item())
        # lower loss = more natural sentence, so sort ascending
        ranked = sorted(zip(prob.choices, losses), key=lambda pair: pair[1])
        shown = ",".join("%s(%.3f)" % pair for pair in ranked)
        rank = [word for word, _ in ranked].index(prob.answer) + 1
        rows.append((prob.text, shown, rank))
    frame = pd.DataFrame(rows, columns=["problem", "scores", "answer_position"])
    frame["correct"] = frame.answer_position == 1
    return frame
# Run the perplexity-based solver on every grade and accumulate the results.
for grade, probs in [("5", eiken5), ("4", eiken4), ("3", eiken3),
                     ("pre2", eikenp2), ("2", eiken2),
                     ("pre1", eikenp1), ("1", eiken1)]:
    o = solve_with_choices2(probs)
    display(o)
    o["grade"] = grade
    o["method"] = "perplexity"
    out = pd.concat([out, o])
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
# The model got a full mark except for just one:
#
# When she asked her boss for a raise, Melanie's {} tone of voice made it obvious how nervous she was.",
# "garish", "jovial", "pompous", "diffident"
#
# The correct answer is "diffident", meaning lacking self-confidence. This matches the fact that she was nervous.
# GPT-2's guess is "jovial", meaning friendly. This may naturally connect to "tone of voice", but misses the following context.
# compile output into human-friendly excel file
out2 = None
for method in out.method.unique():
    tmp = out[out.method == method].copy().reset_index(drop=True)
    # each grade contributes exactly 10 problems, so number them 1..10 per grade
    tmp["problem_number"] = list(range(1, 11)) * (len(tmp) // 10)
    tmp = tmp[["grade", "problem_number", "problem", "scores", "correct"]]
    tmp = tmp.rename(columns={"scores": "scores({})".format(method),
                              "correct": "correct({})".format(method)})
    if out2 is None:
        out2 = tmp
    else:
        # merge on the shared keys (grade, problem_number, problem) to go wide
        out2 = pd.merge(out2, tmp)
display(out2.head())
# count correct answers per grade x method; use the string "sum" instead of the
# builtin sum — passing builtin callables as aggfunc is deprecated in pandas
out3 = out.pivot_table(index="grade", columns="method", values="correct", aggfunc="sum")
# sort grades in difficulty order (5 easiest .. 1 hardest, with pre-grades between)
out3["grade2"] = out3.index.tolist()
out3.grade2 = out3.grade2.replace("pre2", 2.5).replace("pre1", 1.5).astype(float)
out3 = out3.sort_values("grade2").drop(columns="grade2")
display(out3)
with pd.ExcelWriter("eiken-bert.xlsx") as f:
    out.to_excel(f, "full-result", index=False)
    out2.to_excel(f, "result-wideformat", index=False)
    out3.to_excel(f, "score-summary")
out.to_csv("eiken-bert_full-result.csv", index=False)
out2.to_csv("eiken-bert_result-wideformat.csv", index=False)
out3.to_csv("eiken-bert_score-summary.csv")
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}