Natural Language Processing Examples

This section provides comprehensive examples for NLP tasks using Torchium’s specialized optimizers and loss functions.

Transformer Training

Large Language Model Training

import torch
import torch.nn as nn
import torchium
import math

class TransformerModel(nn.Module):
    def __init__(self, vocab_size=50000, d_model=512, nhead=8, num_layers=6, max_len=1000):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Parameter(torch.randn(max_len, d_model))
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, batch_first=True),
            num_layers
        )
        self.classifier = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask=None):
        seq_len = x.size(1)
        x = self.embedding(x) * math.sqrt(self.d_model)
        x = x + self.pos_encoding[:seq_len, :].unsqueeze(0)
        x = self.dropout(x)
        # src_key_padding_mask expects True at padding positions (the inverse of a
        # HuggingFace-style attention_mask, where 1 marks real tokens)
        x = self.transformer(x, src_key_padding_mask=mask)
        return self.classifier(x)

model = TransformerModel(vocab_size=50000, d_model=512, nhead=8, num_layers=6)

# Use LAMB for large batch training
optimizer = torchium.optimizers.LAMB(
    model.parameters(),
    lr=1e-3,
    betas=(0.9, 0.999),
    eps=1e-6,
    weight_decay=0.01,
    clamp_value=10.0
)

# Advanced NLP loss with label smoothing
criterion = torchium.losses.LabelSmoothingLoss(
    num_classes=50000,
    smoothing=0.1
)
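
For reference, label smoothing replaces the one-hot target with a mixture of the true class and a uniform distribution over all classes. A minimal sketch of the idea (this is not necessarily how torchium.losses.LabelSmoothingLoss is implemented internally):

class ReferenceLabelSmoothing(nn.Module):
    """Cross-entropy against a smoothed target distribution."""
    def __init__(self, num_classes, smoothing=0.1):
        super().__init__()
        self.num_classes = num_classes
        self.smoothing = smoothing

    def forward(self, logits, target):
        # logits: (N, num_classes), target: (N,)
        log_probs = torch.log_softmax(logits, dim=-1)
        nll = -log_probs.gather(dim=-1, index=target.unsqueeze(-1)).squeeze(-1)
        uniform = -log_probs.mean(dim=-1)
        return ((1.0 - self.smoothing) * nll + self.smoothing * uniform).mean()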

# Training loop with gradient clipping
def train_transformer(model, optimizer, criterion, dataloader, num_epochs=100):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0

        for batch in dataloader:
            optimizer.zero_grad()

            # Forward pass
            output = model(batch.input_ids, batch.attention_mask)

            # Compute loss
            loss = criterion(output.view(-1, output.size(-1)), batch.labels.view(-1))

            # Backward pass
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')
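
LAMB is typically paired with a learning-rate warmup for large-batch training. Assuming torchium's LAMB subclasses torch.optim.Optimizer, a linear-warmup schedule can be attached with PyTorch's LambdaLR and stepped once per batch right after optimizer.step() (warmup_steps and total_steps below are illustrative values):

warmup_steps = 4000
total_steps = 100000

def lr_lambda(step):
    if step < warmup_steps:
        return (step + 1) / warmup_steps
    # linear decay after warmup
    return max(0.0, (total_steps - step) / (total_steps - warmup_steps))

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

# inside the batch loop:
#     optimizer.step()
#     scheduler.step()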

Sequence-to-Sequence Models

Encoder-Decoder Architecture

class Seq2SeqModel(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size, d_model=512, nhead=8, num_layers=6):
        super().__init__()
        self.d_model = d_model

        # Encoder
        self.encoder_embedding = nn.Embedding(input_vocab_size, d_model)
        self.encoder_pos_encoding = nn.Parameter(torch.randn(1000, d_model))
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, batch_first=True),
            num_layers
        )

        # Decoder
        self.decoder_embedding = nn.Embedding(output_vocab_size, d_model)
        self.decoder_pos_encoding = nn.Parameter(torch.randn(1000, d_model))
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model, nhead, batch_first=True),
            num_layers
        )

        # Output projection
        self.output_projection = nn.Linear(d_model, output_vocab_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        # Encoder
        src_seq_len = src.size(1)
        src_emb = self.encoder_embedding(src) * math.sqrt(self.d_model)
        src_emb = src_emb + self.encoder_pos_encoding[:src_seq_len, :].unsqueeze(0)
        src_emb = self.dropout(src_emb)
        encoder_output = self.encoder(src_emb, src_key_padding_mask=src_mask)

        # Decoder
        tgt_seq_len = tgt.size(1)
        tgt_emb = self.decoder_embedding(tgt) * math.sqrt(self.d_model)
        tgt_emb = tgt_emb + self.decoder_pos_encoding[:tgt_seq_len, :].unsqueeze(0)
        tgt_emb = self.dropout(tgt_emb)
        decoder_output = self.decoder(tgt_emb, encoder_output,
                                     tgt_mask=tgt_mask, memory_key_padding_mask=src_mask)

        return self.output_projection(decoder_output)

model = Seq2SeqModel(input_vocab_size=30000, output_vocab_size=30000)

# Use NovoGrad for NLP tasks
optimizer = torchium.optimizers.NovoGrad(
    model.parameters(),
    lr=1e-3,
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=0.01,
    grad_averaging=True
)

# Combined loss for seq2seq
class Seq2SeqLoss(nn.Module):
    def __init__(self, vocab_size=30000):
        super().__init__()
        self.ce_loss = torchium.losses.CrossEntropyLoss()
        self.label_smoothing = torchium.losses.LabelSmoothingLoss(
            num_classes=vocab_size, smoothing=0.1
        )

    def forward(self, pred, target):
        ce_loss = self.ce_loss(pred, target)
        smooth_loss = self.label_smoothing(pred, target)
        return 0.7 * ce_loss + 0.3 * smooth_loss

criterion = Seq2SeqLoss()

# Training loop for seq2seq
def train_seq2seq(model, optimizer, criterion, dataloader, num_epochs=100):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0

        for batch in dataloader:
            optimizer.zero_grad()

            # Teacher forcing: feed the target shifted right, predict the next token
            tgt_in = batch.tgt[:, :-1]
            tgt_out = batch.tgt[:, 1:]
            tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt_in.size(1)).to(tgt_in.device)

            # Forward pass
            output = model(batch.src, tgt_in, batch.src_mask, tgt_mask)

            # Compute loss against the shifted targets
            loss = criterion(output.reshape(-1, output.size(-1)), tgt_out.reshape(-1))

            # Backward pass
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')
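
The batch.src_mask field is assumed to come from the dataloader. Since src_key_padding_mask and memory_key_padding_mask expect True at padding positions, the mask can be built directly from the pad token id (PAD_ID is an assumption about your vocabulary):

PAD_ID = 0  # assumed padding token id

def make_padding_mask(token_ids, pad_id=PAD_ID):
    # True where the position is padding and should be ignored by attention
    return token_ids == pad_id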

Word Embeddings

Word2Vec Training

class Word2VecModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=300):
        super().__init__()
        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim

    def forward(self, target_words, context_words, negative_words):
        # Target embeddings
        target_emb = self.target_embeddings(target_words)

        # Context embeddings
        context_emb = self.context_embeddings(context_words)

        # Negative embeddings
        negative_emb = self.context_embeddings(negative_words)

        return target_emb, context_emb, negative_emb

model = Word2VecModel(vocab_size=100000, embedding_dim=300)

# Use SGD for word embeddings
optimizer = torchium.optimizers.SGD(
    model.parameters(),
    lr=0.025,
    momentum=0.9
)

# Word2Vec specific loss
criterion = torchium.losses.Word2VecLoss(
    vocab_size=100000,
    embedding_dim=300,
    negative_samples=5
)
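
For reference, the skip-gram negative-sampling objective maximizes the score of observed (target, context) pairs and minimizes it for sampled negatives. A minimal sketch of that loss on the embeddings returned by the model above (not necessarily how torchium.losses.Word2VecLoss is implemented):

import torch.nn.functional as F

def sgns_loss_reference(target_emb, context_emb, negative_emb):
    # target_emb, context_emb: (batch, dim); negative_emb: (batch, num_neg, dim)
    pos_score = (target_emb * context_emb).sum(dim=-1)                         # (batch,)
    pos_loss = F.logsigmoid(pos_score)
    neg_score = torch.bmm(negative_emb, target_emb.unsqueeze(-1)).squeeze(-1)  # (batch, num_neg)
    neg_loss = F.logsigmoid(-neg_score).sum(dim=-1)
    return -(pos_loss + neg_loss).mean()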

# Training loop for Word2Vec
def train_word2vec(model, optimizer, criterion, dataloader, num_epochs=100):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0

        for batch in dataloader:
            optimizer.zero_grad()

            # Forward pass
            target_emb, context_emb, negative_emb = model(
                batch.target_words, batch.context_words, batch.negative_words
            )

            # Compute loss
            loss = criterion(target_emb, context_emb, negative_emb)

            # Backward pass
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')
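
Once trained, the target embedding matrix can be queried directly; a small helper for cosine-similarity nearest neighbours:

@torch.no_grad()
def nearest_neighbors(model, word_id, k=10):
    emb = torch.nn.functional.normalize(model.target_embeddings.weight, dim=-1)  # (vocab, dim)
    sims = emb @ emb[word_id]             # cosine similarity to every word
    return sims.topk(k + 1).indices[1:]   # drop the query word itself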

GloVe Training

class GloVeModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=300):
        super().__init__()
        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.target_bias = nn.Embedding(vocab_size, 1)
        self.context_bias = nn.Embedding(vocab_size, 1)

    def forward(self, target_words, context_words, cooccurrence_counts):
        # Target embeddings
        target_emb = self.target_embeddings(target_words)
        target_bias = self.target_bias(target_words)

        # Context embeddings
        context_emb = self.context_embeddings(context_words)
        context_bias = self.context_bias(context_words)

        return target_emb, context_emb, target_bias, context_bias, cooccurrence_counts

model = GloVeModel(vocab_size=100000, embedding_dim=300)

# Use Adam for GloVe
optimizer = torchium.optimizers.Adam(
    model.parameters(),
    lr=0.05
)

# GloVe specific loss
criterion = torchium.losses.GloVeLoss(
    vocab_size=100000,
    embedding_dim=300,
    x_max=100.0,
    alpha=0.75
)

# Training loop for GloVe
def train_glove(model, optimizer, criterion, dataloader, num_epochs=100):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0

        for batch in dataloader:
            optimizer.zero_grad()

            # Forward pass
            target_emb, context_emb, target_bias, context_bias, cooccurrence_counts = model(
                batch.target_words, batch.context_words, batch.cooccurrence_counts
            )

            # Compute loss
            loss = criterion(target_emb, context_emb, target_bias, context_bias, cooccurrence_counts)

            # Backward pass
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')
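
For reference, the GloVe objective is a weighted least-squares fit of the dot product plus biases to the log co-occurrence count, with the weighting function f(x) = (x / x_max)^alpha capped at 1. A sketch of that computation on the tensors the model returns (not necessarily torchium.losses.GloVeLoss's exact implementation):

def glove_loss_reference(target_emb, context_emb, target_bias, context_bias,
                         counts, x_max=100.0, alpha=0.75):
    # weighting function: down-weights rare pairs, caps frequent ones at 1
    weights = torch.clamp(counts / x_max, max=1.0) ** alpha
    dot = (target_emb * context_emb).sum(dim=-1)
    diff = dot + target_bias.squeeze(-1) + context_bias.squeeze(-1) - torch.log(counts)
    return (weights * diff ** 2).mean()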

Text Classification

BERT-style Classification

class BERTClassifier(nn.Module):
    def __init__(self, vocab_size, d_model=768, nhead=12, num_layers=12, num_classes=2):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Parameter(torch.randn(512, d_model))
        self.segment_embedding = nn.Embedding(2, d_model)

        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, batch_first=True),
            num_layers
        )

        self.classifier = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(d_model, num_classes)
        )

    def forward(self, input_ids, attention_mask, segment_ids):
        seq_len = input_ids.size(1)
        x = self.embedding(input_ids) * math.sqrt(self.d_model)
        x = x + self.pos_encoding[:seq_len, :].unsqueeze(0)
        x = x + self.segment_embedding(segment_ids)

        x = self.transformer(x, src_key_padding_mask=attention_mask)

        # Use [CLS] token for classification
        cls_output = x[:, 0, :]
        return self.classifier(cls_output)

model = BERTClassifier(vocab_size=30000, num_classes=2)

# Use AdamW for BERT-style training
optimizer = torchium.optimizers.AdamW(
    model.parameters(),
    lr=2e-5,
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=0.01
)
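
A common refinement when fine-tuning BERT-style models is to exclude biases and LayerNorm parameters from weight decay. Assuming torchium's AdamW accepts parameter groups the way torch.optim.AdamW does, the optimizer above could instead be built as:

decay, no_decay = [], []
for name, param in model.named_parameters():
    if name.endswith("bias") or "norm" in name.lower():
        no_decay.append(param)
    else:
        decay.append(param)

optimizer = torchium.optimizers.AdamW(
    [{"params": decay, "weight_decay": 0.01},
     {"params": no_decay, "weight_decay": 0.0}],
    lr=2e-5,
)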

# Classification loss with label smoothing
criterion = torchium.losses.LabelSmoothingLoss(
    num_classes=2,
    smoothing=0.1
)

# Training loop for BERT classification
def train_bert_classifier(model, optimizer, criterion, dataloader, num_epochs=10):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0

        for batch in dataloader:
            optimizer.zero_grad()

            # Forward pass
            output = model(batch.input_ids, batch.attention_mask, batch.segment_ids)

            # Compute loss
            loss = criterion(output, batch.labels)

            # Backward pass
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')

Named Entity Recognition

CRF-based NER

class NERModel(nn.Module):
    def __init__(self, vocab_size, num_tags, embedding_dim=128, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(hidden_dim * 2, num_tags)

    def forward(self, input_ids, attention_mask):
        # attention_mask is not used inside the encoder; it is consumed by the CRF
        # loss below to mask out padded positions
        x = self.embedding(input_ids)
        x, _ = self.lstm(x)
        x = self.classifier(x)
        return x

model = NERModel(vocab_size=30000, num_tags=9)  # BIO tagging scheme
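
The 9 tags correspond to a CoNLL-2003-style BIO scheme; an illustrative mapping (the actual tag set depends on your dataset):

tag2id = {
    "O": 0,
    "B-PER": 1, "I-PER": 2,
    "B-ORG": 3, "I-ORG": 4,
    "B-LOC": 5, "I-LOC": 6,
    "B-MISC": 7, "I-MISC": 8,
}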

# Use AdamW for NER
optimizer = torchium.optimizers.AdamW(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-4
)

# CRF loss for NER
criterion = torchium.losses.CRFLoss(
    num_tags=9,
    batch_first=True
)

# Training loop for NER
def train_ner_model(model, optimizer, criterion, dataloader, num_epochs=100):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0

        for batch in dataloader:
            optimizer.zero_grad()

            # Forward pass
            output = model(batch.input_ids, batch.attention_mask)

            # Compute loss
            loss = criterion(output, batch.tags, batch.attention_mask)

            # Backward pass
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')

Text Generation

GPT-style Generation

class GPTModel(nn.Module):
    def __init__(self, vocab_size, d_model=768, nhead=12, num_layers=12, max_len=1024):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Parameter(torch.randn(max_len, d_model))

        self.transformer = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model, nhead, batch_first=True),
            num_layers
        )

        self.classifier = nn.Linear(d_model, vocab_size)

    def forward(self, input_ids, attention_mask=None):
        seq_len = input_ids.size(1)
        x = self.embedding(input_ids) * math.sqrt(self.d_model)
        x = x + self.pos_encoding[:seq_len, :].unsqueeze(0)

        # Causal mask: True above the diagonal blocks attention to future tokens
        tgt_mask = torch.triu(
            torch.ones(seq_len, seq_len, device=input_ids.device), diagonal=1
        ).bool()

        # Decoder-only setup: the sequence attends over itself (memory = x)
        x = self.transformer(x, x, tgt_mask=tgt_mask, memory_key_padding_mask=attention_mask)
        return self.classifier(x)

model = GPTModel(vocab_size=50000, d_model=768, nhead=12, num_layers=12)

# Use AdamW for GPT training
optimizer = torchium.optimizers.AdamW(
    model.parameters(),
    lr=1e-4,
    betas=(0.9, 0.95),
    eps=1e-8,
    weight_decay=0.1
)

# Perplexity loss for text generation
criterion = torchium.losses.PerplexityLoss()
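
Perplexity is the exponential of the mean per-token cross-entropy. If you need to report it from raw logits, a minimal sketch (the padding id is an assumption; torchium.losses.PerplexityLoss may handle this differently):

def perplexity_from_logits(logits, target_ids, pad_id=0):
    ce = torch.nn.functional.cross_entropy(
        logits.reshape(-1, logits.size(-1)),
        target_ids.reshape(-1),
        ignore_index=pad_id,
    )
    return torch.exp(ce)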

# Training loop for GPT
def train_gpt_model(model, optimizer, criterion, dataloader, num_epochs=100):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0

        for batch in dataloader:
            optimizer.zero_grad()

            # Forward pass
            output = model(batch.input_ids, batch.attention_mask)

            # Compute loss
            loss = criterion(output, batch.target_ids)

            # Backward pass
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')
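
After training, text can be sampled from the model autoregressively. A minimal greedy-decoding sketch (in practice you would also handle an end-of-sequence token and consider sampling or beam search):

@torch.no_grad()
def generate(model, prompt_ids, max_new_tokens=50):
    # prompt_ids: (1, prompt_len) tensor of token ids
    model.eval()
    ids = prompt_ids
    for _ in range(max_new_tokens):
        logits = model(ids)                               # (1, seq_len, vocab_size)
        next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        ids = torch.cat([ids, next_id], dim=1)
    return ids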

Multi-Task NLP

Multi-Task Learning for NLP

class MultiTaskNLPModel(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_layers=6):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Parameter(torch.randn(1000, d_model))

        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, batch_first=True),
            num_layers
        )

        # Task-specific heads
        self.classifier = nn.Linear(d_model, 2)      # Sentiment analysis
        self.ner_classifier = nn.Linear(d_model, 9)  # Named entity recognition
        self.qa_classifier = nn.Linear(d_model, 2)   # Question answering

    def forward(self, input_ids, attention_mask, task_type):
        seq_len = input_ids.size(1)
        x = self.embedding(input_ids) * math.sqrt(self.d_model)
        x = x + self.pos_encoding[:seq_len, :].unsqueeze(0)

        x = self.transformer(x, src_key_padding_mask=attention_mask)

        # Use [CLS] token for classification tasks
        cls_output = x[:, 0, :]

        if task_type == 'classification':
            return self.classifier(cls_output)
        elif task_type == 'ner':
            return self.ner_classifier(x)
        elif task_type == 'qa':
            return self.qa_classifier(cls_output)

model = MultiTaskNLPModel(vocab_size=30000)

# Use PCGrad for multi-task learning
optimizer = torchium.optimizers.PCGrad(
    model.parameters(),
    lr=1e-3
)

# Multi-task loss with uncertainty weighting
class MultiTaskNLPLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.uncertainty_loss = torchium.losses.UncertaintyWeightingLoss(num_tasks=3)
        self.cls_loss = torchium.losses.CrossEntropyLoss()
        self.ner_loss = torchium.losses.CRFLoss(num_tags=9, batch_first=True)
        self.qa_loss = torchium.losses.CrossEntropyLoss()

    def forward(self, cls_pred, ner_pred, qa_pred, cls_target, ner_target, qa_target, attention_mask):
        cls_loss = self.cls_loss(cls_pred, cls_target)
        ner_loss = self.ner_loss(ner_pred, ner_target, attention_mask)
        qa_loss = self.qa_loss(qa_pred, qa_target)

        return self.uncertainty_loss([cls_loss, ner_loss, qa_loss])

criterion = MultiTaskNLPLoss()
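
For reference, uncertainty weighting (Kendall et al., 2018) learns one log-variance per task and scales each loss accordingly; a minimal sketch of the idea (not necessarily torchium.losses.UncertaintyWeightingLoss's exact formulation). If the criterion carries learnable weights like this, remember to pass its parameters to the optimizer as well:

class ReferenceUncertaintyWeighting(nn.Module):
    def __init__(self, num_tasks):
        super().__init__()
        self.log_vars = nn.Parameter(torch.zeros(num_tasks))

    def forward(self, losses):
        total = 0.0
        for i, loss in enumerate(losses):
            # exp(-log_var) down-weights noisy tasks; + log_var keeps weights from collapsing to zero
            total = total + torch.exp(-self.log_vars[i]) * loss + self.log_vars[i]
        return total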

# Training loop for multi-task NLP
def train_multitask_nlp(model, optimizer, criterion, dataloader, num_epochs=100):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0

        for batch in dataloader:
            optimizer.zero_grad()

            # Forward pass for each task
            cls_pred = model(batch.input_ids, batch.attention_mask, 'classification')
            ner_pred = model(batch.input_ids, batch.attention_mask, 'ner')
            qa_pred = model(batch.input_ids, batch.attention_mask, 'qa')

            # Compute loss
            loss = criterion(cls_pred, ner_pred, qa_pred,
                            batch.cls_targets, batch.ner_targets, batch.qa_targets,
                            batch.attention_mask)

            # Backward pass with gradient surgery
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')

These examples show how Torchium’s specialized optimizers and loss functions can be combined for common NLP workloads, from transformer language modeling and sequence-to-sequence translation to word embeddings, classification, tagging, generation, and multi-task learning.