Identifying and fixing a ValueError in NLI model fine-tuning process.
Loading data from Talking to Chatbots Dataset reddgr/talking-to-chatbots-chats …
π # nli_finetuning_utils.py
import pandas as pd
import random
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch import cuda
class NLIModelFineTuner:
def __init__(self, dataset, labels, model, tokenizer):
self.dataset = dataset
self.labels = labels
self.model = model
self.tokenizer = tokenizer
self.train_dataset = None
def tokenize_and_format_dataset(self, template=”This example is {}.”, num_contradictions=2, max_length=128):
“””
Tokenizes and formats the dataset for fine-tuning.
Args:
template (str): Template for entailment and contradiction examples.
num_contradictions (int): Number of contradiction examples per input.
max_length (int): Maximum sequence length for padding/truncation.
Returns:
formatted_dataset: Hugging Face Dataset with tokenized examples.
“””
# Convert the dataset to a Pandas DataFrame
df = self.dataset.to_pandas()
input_ids = []
attention_masks = []
labels = []
input_sentences = [] # To store original untruncated sentences
eos_token_id = self.tokenizer.eos_token_id # Ensure consistency for eos tokens
for index, row in df.iterrows():
text = row[“text”]
label = row[“class”]
# Construct original entailment sentence
entailment_text = f”
input_sentences.append(entailment_text)
# Tokenize entailment example
entailment_encoding = self.tokenizer(
entailment_text,
max_length=max_length – 1, # Leave space for
padding=”max_length”,
truncation=True,
return_tensors=”pt”,
add_special_tokens=True,
)
# Append eos token if missing
#if entailment_encoding[“input_ids”][0, -1] != eos_token_id:
# entailment_encoding[“input_ids”][0, -1] = eos_token_id
input_ids.append(entailment_encoding[“input_ids”].squeeze(0))
attention_masks.append(entailment_encoding[“attention_mask”].squeeze(0))
labels.append(2) # Entailment label
# Construct and tokenize contradiction examples
possible_contradictions = [x for x in self.labels if x != label]
selected_contradictions = random.sample(possible_contradictions, num_contradictions)
for contradiction_label in selected_contradictions:
contradiction_text = f”
input_sentences.append(contradiction_text)
contradiction_encoding = self.tokenizer(
contradiction_text,
max_length=max_length – 1, # Leave space for
padding=”max_length”,
truncation=True,
return_tensors=”pt”,
add_special_tokens=True,
)
# Append eos token if missing
if contradiction_encoding[“input_ids”][0, -1] != eos_token_id:
contradiction_encoding[“input_ids”][0, -1] = eos_token_id
input_ids.append(contradiction_encoding[“input_ids”].squeeze(0))
attention_masks.append(contradiction_encoding[“attention_mask”].squeeze(0))
labels.append(0) # Contradiction label
# Combine into a Hugging Face Dataset
formatted_dataset = Dataset.from_dict({
“input_ids”: [seq.tolist() for seq in input_ids],
“attention_mask”: [mask.tolist() for mask in attention_masks],
“labels”: labels,
“input_sentence”: input_sentences, # Keep original sentences
})
self.train_dataset = formatted_dataset
return formatted_dataset
def fine_tune(self, output_dir=”./results”, epochs=3, batch_size=8, learning_rate=5e-5):
“””
Fine-tune the NLI model.
Args:
output_dir (str): Directory to save model checkpoints.
epochs (int): Number of training epochs.
batch_size (int): Batch size for training.
learning_rate (float): Learning rate for the optimizer.
“””
device = “cuda:0” if cuda.is_available() else “cpu”
print(f”Using device: {device}”)
self.model = self.model.to(device)
# Verify that the dataset is PyTorch-compatible
self.train_dataset.set_format(type=”torch”, columns=[“input_ids”, “attention_mask”, “labels”])
training_args = TrainingArguments(
output_dir=output_dir,
num_train_epochs=epochs,
per_device_train_batch_size=batch_size,
save_steps=10,
save_total_limit=2,
learning_rate=learning_rate,
logging_dir=f”{output_dir}/logs”,
evaluation_strategy=”epoch”,
#report_to=”none”,
)
trainer = Trainer(
model=self.model,
args=training_args,
train_dataset=self.train_dataset
)
print(“Fine-tuning in progress…”)
trainer.train()
print(f”Fine-tuning complete. Model saved to {output_dir}.”)
return trainer
def OLD_tokenize_and_format_dataset(self, template=”This example is {}.”, num_contradictions=2):
# Convert the dataset to a Pandas DataFrame
df = self.dataset.to_pandas()
entailment_input_ids = []
contradiction_input_ids = []
attention_masks = []
labels_list = []
input_sentences = []
for index, row in df.iterrows():
text = row[“text”]
label = row[“class”]
# Encode the entailment example
encoded_text = self.tokenizer.encode(f”
entailment_ids = encoded_text + self.tokenizer.encode(
f” {template.format(label)}”, add_special_tokens=False
)
# Add entailment example
entailment_input_ids.append(entailment_ids)
attention_masks.append([1] * len(entailment_ids))
labels_list.append(2) # Entailment label
input_sentences.append(f”
# Create contradiction examples
possible_contradictions = [x for x in self.labels if x != label]
selected_contradictions = random.sample(possible_contradictions, num_contradictions)
for contradiction_label in selected_contradictions:
contradiction_ids = encoded_text + self.tokenizer.encode(
f” {template.format(contradiction_label)}”, add_special_tokens=False
)
contradiction_input_ids.append(contradiction_ids)
attention_masks.append([1] * len(contradiction_ids))
labels_list.append(0) # Contradiction label
input_sentences.append(f”
# Combine entailment and contradiction data
transformed_df = pd.DataFrame({
“input_ids”: entailment_input_ids + contradiction_input_ids,
“attention_mask”: attention_masks,
“labels”: labels_list,
“input_sentence”: input_sentences,
})
# Convert the DataFrame back to a Hugging Face Dataset
formatted_dataset = Dataset.from_pandas(transformed_df)
self.train_dataset = formatted_dataset
return formatted_dataset
# code that uses the module:
import transformers
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import random
from datasets import load_dataset, Dataset
import pandas as pd
import nli_finetuning_utils
import sys
import torch
# Load dataset and initialize parameters
dataset = load_dataset(“fancyzhx/ag_news”, split=”test”)
use_sampled_dataset=False
if use_sampled_dataset:
dataset = dataset.shuffle(seed=42).select(range(20))
dataset = dataset.select(range(20))
labels = [“World”, “Sports”, “Business”, “Sci/Tech”]
dataset = dataset.map(lambda x: {“class”: labels[x[“label”]]}, remove_columns=[“label”])
nli_tokenizer = AutoTokenizer.from_pretrained(‘facebook/bart-large-mnli’, clean_up_tokenization_spaces=True)
nli_model = AutoModelForSequenceClassification.from_pretrained(‘facebook/bart-large-mnli’, clean_up_tokenization_spaces=True)
print(f”Loaded NLI model with head:\n{nli_model.classification_head.out_proj}\n{nli_model.config.id2label}”)
# Instantiate the NLIModelFineTuner class
fine_tuner = nli_finetuning_utils.NLIModelFineTuner(dataset, labels, nli_model, nli_tokenizer)
# Tokenize and format the dataset
num_contradictions = 3
template = “This example is a {} prompt.”
train_dataset = fine_tuner.tokenize_and_format_dataset(template=template, num_contradictions=num_contradictions, max_length=128)
fine_tuner.fine_tune(output_dir=”./models”, epochs=3, batch_size=8, learning_rate=0.0001)
# ERROR RAISED
—————————————————————————
ValueError Traceback (most recent call last)
Cell In[5], line 1
—-> 1 fine_tuner.fine_tune(output_dir=”./models”, epochs=3, batch_size=8, learning_rate=0.0001)
File [PATH]\Documents\python_scripts\nli_finetuning\nli_finetuning_utils.py:132, in NLIModelFineTuner.fine_tune(self, output_dir, epochs, batch_size, learning_rate)
125 trainer = Trainer(
126 model=self.model,
127 args=training_args,
128 train_dataset=self.train_dataset
129 )
131 print(“Fine-tuning in progress…”)
–> 132 trainer.train()
133 print(f”Fine-tuning complete. Model saved to {output_dir}.”)
134 return trainer
File [PATH]\anaconda3\Lib\site-packages\transformers\trainer.py:1938, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1936 hf_hub_utils.enable_progress_bars()
1937 else:
-> 1938 return inner_training_loop(
1939 args=args,
1940 resume_from_checkpoint=resume_from_checkpoint,
1941 trial=trial,
1942 ignore_keys_for_eval=ignore_keys_for_eval,
1943 )
…
1824 :, -1, :
1825 ]
1826 logits = self.classification_head(sentence_representation)
ValueError: All examples must have the same number of
TRY TO SPOT THE ERROR AND WAY TO FIX
π€ The error you’re encountering arises because the BART model you’re fine-tuning expects all input sequences in the batch to have the same number of `` tokens. In your current code, the number of ` ` tokens varies across different sequences, causing the `ValueError: All examples must have the same number of tokens`.
**Here’s why the error occurs:**
– **Inconsistent Handling of `` Tokens**: In your `tokenize_and_format_dataset` method, you have commented out the code that appends an ` ` token to the entailment examples: python # Append eos token if missing # if entailment_encoding["input_ids"][0, -1] != eos_token_id: # entailment_encoding["input_ids"][0, -1] = eos_token_id
However, for contradiction examples, you have this code active:python # Append eos token if missing if contradiction_encoding["input_ids"][0, -1] != eos_token_id: contradiction_encoding["input_ids"][0, -1] = eos_token_id
This inconsistency leads to some sequences having an `` token at the end while others do not.
– **Manual Manipulation of `` Tokens**: Manually appending ` ` tokens and adjusting `max_length` by subtracting one (`max_length – 1`) can lead to unintended sequence lengths and token placements, especially when padding and truncation are involved.
**How to Fix the Error:**
1. **Uncomment and Correct the `` Token Handling for Entailment Examples**:
Ensure that the `` token is appended consistently across all examples. Uncomment and adjust the code for entailment examples: python # Append eos token if missing if entailment_encoding["input_ids"][0, -1] != eos_token_id: entailment_encoding["input_ids"][0, -1] = eos_token_id
2. **Avoid Manual Manipulation of `` Tokens**:
Let the tokenizer handle the special tokens automatically by setting `add_special_tokens=True`. Remove any code that manually appends or manipulates `` tokens.
3. **Set `max_length` Appropriately**:
Do not subtract one from `max_length`. Set `max_length` to the desired sequence length and let the tokenizer handle truncation and padding.
4. **Verify Tokenizer Behavior**:
Ensure that the tokenizer is correctly adding special tokens (including ``) by inspecting a few tokenized examples.
**Revised `tokenize_and_format_dataset` Method:**python def tokenize_and_format_dataset(self, template="This example is {}.", num_contradictions=2, max_length=128): # Convert the dataset to a Pandas DataFrame df = self.dataset.to_pandas() input_ids = [] attention_masks = [] labels = [] input_sentences = [] # To store original untruncated sentences for index, row in df.iterrows(): text = row["text"] label = row["class"] # Construct original entailment sentence entailment_text = f"
{text} {template.format(label)}" input_sentences.append(entailment_text) # Tokenize entailment example entailment_encoding = self.tokenizer( entailment_text, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=True, ) input_ids.append(entailment_encoding["input_ids"].squeeze(0)) attention_masks.append(entailment_encoding["attention_mask"].squeeze(0)) labels.append(2) # Entailment label # Construct and tokenize contradiction examples possible_contradictions = [x for x in self.labels if x != label] selected_contradictions = random.sample(possible_contradictions, num_contradictions) for contradiction_label in selected_contradictions: contradiction_text = f"{text} {template.format(contradiction_label)}" input_sentences.append(contradiction_text) contradiction_encoding = self.tokenizer( contradiction_text, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=True, ) input_ids.append(contradiction_encoding["input_ids"].squeeze(0)) attention_masks.append(contradiction_encoding["attention_mask"].squeeze(0)) labels.append(0) # Contradiction label # Combine into a Hugging Face Dataset formatted_dataset = Dataset.from_dict({ "input_ids": [seq.tolist() for seq in input_ids], "attention_mask": [mask.tolist() for mask in attention_masks], "labels": labels, "input_sentence": input_sentences, # Keep original sentences }) self.train_dataset = formatted_dataset return formatted_dataset
**Explanation of Changes:**
– **Removed Manual `` Token Manipulation**: The code that manually appends or replaces the ` ` token has been removed to prevent inconsistencies.
– **Adjusted `max_length`**: Set `max_length` directly without subtracting one, allowing the tokenizer to manage the sequence length properly.
– **Relied on Tokenizer for Special Tokens**: By setting `add_special_tokens=True`, the tokenizer will handle the addition of `` and ` ` tokens consistently.
**Additional Steps:**
– **Verify Tokenized Sequences**: Before proceeding with fine-tuning, print out some tokenized sequences to confirm that they all have exactly one `` token at the end. python print(self.tokenizer.convert_ids_to_tokens(entailment_encoding["input_ids"][0]))
– **Ensure Consistent Sequence Lengths**: By setting `padding=”max_length”`, all sequences will be padded to the same length, which is essential for batch processing.
**Outcome:**
By making these adjustments, you ensure that all input sequences have a consistent number of `` tokens, satisfying the model’s requirements and preventing the `ValueError`. The model should now be able to fine-tune on your dataset without issues.
Hugging Face Dataset Metrics
All the conversation prompts, responses, and metrics are available to download and explore on Hugging Face dataset reddgr/talking-to-chatbots-chats: