Identifying and fixing a ValueError in an NLI model fine-tuning process.

😎 # nli_finetuning_utils.py

```python
import pandas as pd
import random
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch import cuda


class NLIModelFineTuner:
    def __init__(self, dataset, labels, model, tokenizer):
        self.dataset = dataset
        self.labels = labels
        self.model = model
        self.tokenizer = tokenizer
        self.train_dataset = None

    def tokenize_and_format_dataset(self, template="This example is {}.", num_contradictions=2, max_length=128):
        """
        Tokenizes and formats the dataset for fine-tuning.

        Args:
            template (str): Template for entailment and contradiction examples.
            num_contradictions (int): Number of contradiction examples per input.
            max_length (int): Maximum sequence length for padding/truncation.

        Returns:
            formatted_dataset: Hugging Face Dataset with tokenized examples.
        """
        # Convert the dataset to a Pandas DataFrame
        df = self.dataset.to_pandas()

        input_ids = []
        attention_masks = []
        labels = []
        input_sentences = []  # To store original untruncated sentences

        eos_token_id = self.tokenizer.eos_token_id  # Ensure consistency for eos tokens

        for index, row in df.iterrows():
            text = row["text"]
            label = row["class"]

            # Construct original entailment sentence
            entailment_text = f"{text} {template.format(label)}"
            input_sentences.append(entailment_text)

            # Tokenize entailment example
            entailment_encoding = self.tokenizer(
                entailment_text,
                max_length=max_length - 1,  # Leave space for <eos> token
                padding="max_length",
                truncation=True,
                return_tensors="pt",
                add_special_tokens=True,
            )
            # Append <eos> token if missing
            # if entailment_encoding["input_ids"][0, -1] != eos_token_id:
            #     entailment_encoding["input_ids"][0, -1] = eos_token_id

            input_ids.append(entailment_encoding["input_ids"].squeeze(0))
            attention_masks.append(entailment_encoding["attention_mask"].squeeze(0))
            labels.append(2)  # Entailment label

            # Construct and tokenize contradiction examples
            possible_contradictions = [x for x in self.labels if x != label]
            selected_contradictions = random.sample(possible_contradictions, num_contradictions)
            for contradiction_label in selected_contradictions:
                contradiction_text = f"{text} {template.format(contradiction_label)}"
                input_sentences.append(contradiction_text)

                contradiction_encoding = self.tokenizer(
                    contradiction_text,
                    max_length=max_length - 1,  # Leave space for <eos> token
                    padding="max_length",
                    truncation=True,
                    return_tensors="pt",
                    add_special_tokens=True,
                )
                # Append <eos> token if missing
                if contradiction_encoding["input_ids"][0, -1] != eos_token_id:
                    contradiction_encoding["input_ids"][0, -1] = eos_token_id

                input_ids.append(contradiction_encoding["input_ids"].squeeze(0))
                attention_masks.append(contradiction_encoding["attention_mask"].squeeze(0))
                labels.append(0)  # Contradiction label

        # Combine into a Hugging Face Dataset
        formatted_dataset = Dataset.from_dict({
            "input_ids": [seq.tolist() for seq in input_ids],
            "attention_mask": [mask.tolist() for mask in attention_masks],
            "labels": labels,
            "input_sentence": input_sentences,  # Keep original sentences
        })
        self.train_dataset = formatted_dataset
        return formatted_dataset

    def fine_tune(self, output_dir="./results", epochs=3, batch_size=8, learning_rate=5e-5):
        """
        Fine-tune the NLI model.

        Args:
            output_dir (str): Directory to save model checkpoints.
            epochs (int): Number of training epochs.
            batch_size (int): Batch size for training.
            learning_rate (float): Learning rate for the optimizer.
        """
        device = "cuda:0" if cuda.is_available() else "cpu"
        print(f"Using device: {device}")
        self.model = self.model.to(device)

        # Verify that the dataset is PyTorch-compatible
        self.train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            save_steps=10,
            save_total_limit=2,
            learning_rate=learning_rate,
            logging_dir=f"{output_dir}/logs",
            evaluation_strategy="epoch",
            # report_to="none",
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset
        )

        print("Fine-tuning in progress...")
        trainer.train()
        print(f"Fine-tuning complete. Model saved to {output_dir}.")
        return trainer


    def OLD_tokenize_and_format_dataset(self, template="This example is {}.", num_contradictions=2):
        # Convert the dataset to a Pandas DataFrame
        df = self.dataset.to_pandas()

        entailment_input_ids = []
        contradiction_input_ids = []
        attention_masks = []
        labels_list = []
        input_sentences = []

        for index, row in df.iterrows():
            text = row["text"]
            label = row["class"]

            # Encode the entailment example
            encoded_text = self.tokenizer.encode(f"{text}", add_special_tokens=False)
            entailment_ids = encoded_text + self.tokenizer.encode(
                f" {template.format(label)}", add_special_tokens=False
            )

            # Add entailment example
            entailment_input_ids.append(entailment_ids)
            attention_masks.append([1] * len(entailment_ids))
            labels_list.append(2)  # Entailment label
            input_sentences.append(f"{text} {template.format(label)}")

            # Create contradiction examples
            possible_contradictions = [x for x in self.labels if x != label]
            selected_contradictions = random.sample(possible_contradictions, num_contradictions)

            for contradiction_label in selected_contradictions:
                contradiction_ids = encoded_text + self.tokenizer.encode(
                    f" {template.format(contradiction_label)}", add_special_tokens=False
                )
                contradiction_input_ids.append(contradiction_ids)
                attention_masks.append([1] * len(contradiction_ids))
                labels_list.append(0)  # Contradiction label
                input_sentences.append(f"{text} {template.format(contradiction_label)}")

        # Combine entailment and contradiction data
        transformed_df = pd.DataFrame({
            "input_ids": entailment_input_ids + contradiction_input_ids,
            "attention_mask": attention_masks,
            "labels": labels_list,
            "input_sentence": input_sentences,
        })

        # Convert the DataFrame back to a Hugging Face Dataset
        formatted_dataset = Dataset.from_pandas(transformed_df)
        self.train_dataset = formatted_dataset
        return formatted_dataset
```



```python
# code that uses the module:

import transformers
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import random
from datasets import load_dataset, Dataset
import pandas as pd
import nli_finetuning_utils
import sys
import torch

# Load dataset and initialize parameters
dataset = load_dataset("fancyzhx/ag_news", split="test")

use_sampled_dataset = False
if use_sampled_dataset:
    dataset = dataset.shuffle(seed=42).select(range(20))

dataset = dataset.select(range(20))
labels = ["World", "Sports", "Business", "Sci/Tech"]
dataset = dataset.map(lambda x: {"class": labels[x["label"]]}, remove_columns=["label"])
nli_tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli', clean_up_tokenization_spaces=True)
nli_model = AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli', clean_up_tokenization_spaces=True)
print(f"Loaded NLI model with head:\n{nli_model.classification_head.out_proj}\n{nli_model.config.id2label}")

# Instantiate the NLIModelFineTuner class
fine_tuner = nli_finetuning_utils.NLIModelFineTuner(dataset, labels, nli_model, nli_tokenizer)

# Tokenize and format the dataset
num_contradictions = 3
template = "This example is a {} prompt."
train_dataset = fine_tuner.tokenize_and_format_dataset(template=template, num_contradictions=num_contradictions, max_length=128)

fine_tuner.fine_tune(output_dir="./models", epochs=3, batch_size=8, learning_rate=0.0001)
```


# ERROR RAISED

```
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[5], line 1
----> 1 fine_tuner.fine_tune(output_dir="./models", epochs=3, batch_size=8, learning_rate=0.0001)

File [PATH]\Documents\python_scripts\nli_finetuning\nli_finetuning_utils.py:132, in NLIModelFineTuner.fine_tune(self, output_dir, epochs, batch_size, learning_rate)
    125 trainer = Trainer(
    126     model=self.model,
    127     args=training_args,
    128     train_dataset=self.train_dataset
    129 )
    131 print("Fine-tuning in progress...")
--> 132 trainer.train()
    133 print(f"Fine-tuning complete. Model saved to {output_dir}.")
    134 return trainer

File [PATH]\anaconda3\Lib\site-packages\transformers\trainer.py:1938, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1936     hf_hub_utils.enable_progress_bars()
   1937 else:
-> 1938     return inner_training_loop(
   1939         args=args,
   1940         resume_from_checkpoint=resume_from_checkpoint,
   1941         trial=trial,
   1942         ignore_keys_for_eval=ignore_keys_for_eval,
   1943     )

...

   1824     :, -1, :
   1825 ]
   1826 logits = self.classification_head(sentence_representation)

ValueError: All examples must have the same number of <eos> tokens.
```


TRY TO SPOT THE ERROR AND WAY TO FIX
🤖 The error you’re encountering arises because the BART model you’re fine-tuning expects all input sequences in a batch to contain the same number of `<eos>` tokens (`</s>` for BART), since the classification head pools the sentence representation at the `<eos>` positions. In your current code, the number of `<eos>` tokens varies across sequences, causing the `ValueError: All examples must have the same number of <eos> tokens`.

**Here’s why the error occurs:**

- **Inconsistent Handling of `<eos>` Tokens**: In your `tokenize_and_format_dataset` method, you have commented out the code that appends an `<eos>` token to the entailment examples:

```python
# Append <eos> token if missing
# if entailment_encoding["input_ids"][0, -1] != eos_token_id:
#     entailment_encoding["input_ids"][0, -1] = eos_token_id
```


However, for contradiction examples, you have this code active:

```python
# Append <eos> token if missing
if contradiction_encoding["input_ids"][0, -1] != eos_token_id:
    contradiction_encoding["input_ids"][0, -1] = eos_token_id
```


This inconsistency means that contradiction sequences shorter than `max_length` end up with two `<eos>` tokens (the one the tokenizer appended plus the one written over the final padding position), while entailment sequences keep only the single `<eos>` the tokenizer added.

- **Manual Manipulation of `<eos>` Tokens**: Manually appending `<eos>` tokens and adjusting `max_length` by subtracting one (`max_length - 1`) can lead to unintended sequence lengths and token placements, especially when padding and truncation are involved (see the sketch after this list for the check the model performs).
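For reference, the check that raises the error lives in `BartForSequenceClassification`: the model pools the sentence representation at the `<eos>` positions and requires every row in the batch to contain the same number of them. A simplified paraphrase of that logic (a sketch, not the exact library source):

```python
import torch

def pool_sentence_representation(hidden_states, input_ids, eos_token_id):
    """Sketch of the eos-based pooling used by BART's classification head."""
    eos_mask = input_ids.eq(eos_token_id)  # True wherever an <eos> token appears
    if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
        raise ValueError("All examples must have the same number of <eos> tokens.")
    # Keep the hidden state at the last <eos> token of each sequence
    return hidden_states[eos_mask, :].view(
        hidden_states.size(0), -1, hidden_states.size(-1)
    )[:, -1, :]
```

Because your entailment rows carry one `<eos>` and most contradiction rows carry two, `eos_mask.sum(1)` is not constant across the batch and the `ValueError` is raised.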

**How to Fix the Error:**

1. **Uncomment and Correct the `<eos>` Token Handling for Entailment Examples**:

If you keep the manual replacement, apply it consistently: uncomment and adjust the code for the entailment examples so it matches the contradiction branch:

```python
# Append <eos> token if missing
if entailment_encoding["input_ids"][0, -1] != eos_token_id:
    entailment_encoding["input_ids"][0, -1] = eos_token_id
```


2. **Avoid Manual Manipulation of `<eos>` Tokens (Preferred)**:

Better still, let the tokenizer handle the special tokens automatically by keeping `add_special_tokens=True`, and remove the code that manually appends or overwrites `<eos>` tokens in both branches.

3. **Set `max_length` Appropriately**:

Do not subtract one from `max_length`. Set `max_length` to the desired sequence length and let the tokenizer handle truncation and padding.

4. **Verify Tokenizer Behavior**:

Ensure that the tokenizer is correctly adding the special tokens (including the final `<eos>`) by inspecting a few tokenized examples, for instance with the snippet shown below.
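For example, a quick sanity check along these lines (a sketch, reusing `train_dataset` and `nli_tokenizer` from the driver script above) confirms that every formatted example contains exactly one `<eos>` token:

```python
# Sanity check: every formatted example should contain exactly one <eos> token.
eos_id = nli_tokenizer.eos_token_id

eos_counts = {
    sum(int(tok) == eos_id for tok in example["input_ids"])
    for example in train_dataset
}
print(f"Distinct <eos> counts across examples: {eos_counts}")  # Expect {1}
```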

**Revised `tokenize_and_format_dataset` Method:**

```python
def tokenize_and_format_dataset(self, template="This example is {}.", num_contradictions=2, max_length=128):
    # Convert the dataset to a Pandas DataFrame
    df = self.dataset.to_pandas()
    
    input_ids = []
    attention_masks = []
    labels = []
    input_sentences = []  # To store original untruncated sentences

    for index, row in df.iterrows():
        text = row["text"]
        label = row["class"]

        # Construct original entailment sentence
        entailment_text = f"{text} {template.format(label)}"
        input_sentences.append(entailment_text)

        # Tokenize entailment example
        entailment_encoding = self.tokenizer(
            entailment_text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            add_special_tokens=True,
        )

        input_ids.append(entailment_encoding["input_ids"].squeeze(0))
        attention_masks.append(entailment_encoding["attention_mask"].squeeze(0))
        labels.append(2)  # Entailment label

        # Construct and tokenize contradiction examples
        possible_contradictions = [x for x in self.labels if x != label]
        selected_contradictions = random.sample(possible_contradictions, num_contradictions)
        for contradiction_label in selected_contradictions:
            contradiction_text = f"{text} {template.format(contradiction_label)}"
            input_sentences.append(contradiction_text)

            contradiction_encoding = self.tokenizer(
                contradiction_text,
                max_length=max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
                add_special_tokens=True,
            )

            input_ids.append(contradiction_encoding["input_ids"].squeeze(0))
            attention_masks.append(contradiction_encoding["attention_mask"].squeeze(0))
            labels.append(0)  # Contradiction label

    # Combine into a Hugging Face Dataset
    formatted_dataset = Dataset.from_dict({
        "input_ids": [seq.tolist() for seq in input_ids],
        "attention_mask": [mask.tolist() for mask in attention_masks],
        "labels": labels,
        "input_sentence": input_sentences,  # Keep original sentences
    })
    self.train_dataset = formatted_dataset
    return formatted_dataset
```



**Explanation of Changes:**

- **Removed Manual `<eos>` Token Manipulation**: The code that manually appends or replaces the `<eos>` token has been removed to prevent inconsistencies.

- **Adjusted `max_length`**: Set `max_length` directly without subtracting one, allowing the tokenizer to manage the sequence length properly.

- **Relied on Tokenizer for Special Tokens**: By setting `add_special_tokens=True`, the tokenizer handles the addition of the `<s>` and `</s>` (i.e. `<eos>`) tokens consistently.
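As a concrete illustration (a sketch using the `nli_tokenizer` loaded in the driver script above; for `facebook/bart-large-mnli` the `<s>` token has id 0 and `</s>` has id 2):

```python
# With add_special_tokens=True (the default), the tokenizer wraps the text itself,
# so no manual <eos> handling is needed.
enc = nli_tokenizer("The match ended in a draw. This example is a Sports prompt.")
print(enc["input_ids"][0], enc["input_ids"][-1])                   # 0 and 2 for this tokenizer
print(nli_tokenizer.convert_ids_to_tokens(enc["input_ids"])[-1])   # '</s>'
```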

**Additional Steps:**

- **Verify Tokenized Sequences**: Before proceeding with fine-tuning, print out some tokenized sequences to confirm that each contains exactly one `<eos>` token (followed only by padding).

```python
print(self.tokenizer.convert_ids_to_tokens(entailment_encoding["input_ids"][0]))
```


- **Ensure Consistent Sequence Lengths**: By setting `padding="max_length"`, all sequences will be padded to the same length, which is essential for batch processing. A quick shape check is sketched below.
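As an illustration (a sketch reusing `train_dataset` from the driver script above; the batch size is arbitrary), the formatted examples should stack into fixed-shape batches:

```python
from torch.utils.data import DataLoader

# Keep only the tensor columns and check that a batch stacks into a fixed shape.
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
loader = DataLoader(train_dataset, batch_size=8)
batch = next(iter(loader))
print(batch["input_ids"].shape)       # e.g. torch.Size([8, 128])
print(batch["attention_mask"].shape)  # same shape as input_ids
```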

**Outcome:**

By making these adjustments, you ensure that all input sequences have a consistent number of `<eos>` tokens, satisfying the model’s requirements and preventing the `ValueError`. The model should now be able to fine-tune on your dataset without issues.

Hugging Face Dataset Metrics

All the conversation prompts, responses, and metrics are available to download and explore on the Hugging Face dataset reddgr/talking-to-chatbots-chats.