Natural Language Processing Examples
====================================

This section provides comprehensive examples for NLP tasks using Torchium's specialized optimizers and loss functions.

Transformer Training
--------------------

Large Language Model Training
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    import torch
    import torch.nn as nn
    import torchium
    import math

    class TransformerModel(nn.Module):
        def __init__(self, vocab_size=50000, d_model=512, nhead=8, num_layers=6, max_len=1000):
            super().__init__()
            self.d_model = d_model
            self.embedding = nn.Embedding(vocab_size, d_model)
            self.pos_encoding = nn.Parameter(torch.randn(max_len, d_model))
            self.transformer = nn.TransformerEncoder(
                nn.TransformerEncoderLayer(d_model, nhead, batch_first=True),
                num_layers
            )
            self.classifier = nn.Linear(d_model, vocab_size)
            self.dropout = nn.Dropout(0.1)

        def forward(self, x, mask=None):
            seq_len = x.size(1)
            x = self.embedding(x) * math.sqrt(self.d_model)
            x = x + self.pos_encoding[:seq_len, :].unsqueeze(0)
            x = self.dropout(x)
            x = self.transformer(x, src_key_padding_mask=mask)
            return self.classifier(x)

    model = TransformerModel(vocab_size=50000, d_model=512, nhead=8, num_layers=6)

    # Use LAMB for large batch training
    optimizer = torchium.optimizers.LAMB(
        model.parameters(),
        lr=1e-3,
        betas=(0.9, 0.999),
        eps=1e-6,
        weight_decay=0.01,
        clamp_value=10.0
    )

    # NLP loss with label smoothing
    criterion = torchium.losses.LabelSmoothingLoss(
        num_classes=50000,
        smoothing=0.1
    )

    # Training loop with gradient clipping
    def train_transformer(model, optimizer, criterion, dataloader, num_epochs=100):
        model.train()
        for epoch in range(num_epochs):
            total_loss = 0
            for batch in dataloader:
                optimizer.zero_grad()

                # Forward pass
                output = model(batch.input_ids, batch.attention_mask)

                # Compute loss
                loss = criterion(output.view(-1, output.size(-1)), batch.labels.view(-1))

                # Backward pass
                loss.backward()

                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                optimizer.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(dataloader)
            print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')
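Large-batch transformer training is usually paired with a learning-rate warmup schedule. Below is a minimal sketch using PyTorch's built-in ``LambdaLR``; the ``warmup_steps`` value and the inverse-square-root decay are illustrative choices, not part of the Torchium API.

.. code-block:: python

    from torch.optim.lr_scheduler import LambdaLR

    warmup_steps = 4000  # illustrative value

    def lr_lambda(step):
        # Linear warmup, then inverse-square-root decay
        step = max(step, 1)
        return min(step / warmup_steps, (warmup_steps / step) ** 0.5)

    scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda)
    # Call scheduler.step() after each optimizer.step() inside train_transformer.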
Sequence-to-Sequence Models
---------------------------

Encoder-Decoder Architecture
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    class Seq2SeqModel(nn.Module):
        def __init__(self, input_vocab_size, output_vocab_size, d_model=512, nhead=8, num_layers=6):
            super().__init__()
            self.d_model = d_model

            # Encoder
            self.encoder_embedding = nn.Embedding(input_vocab_size, d_model)
            self.encoder_pos_encoding = nn.Parameter(torch.randn(1000, d_model))
            self.encoder = nn.TransformerEncoder(
                nn.TransformerEncoderLayer(d_model, nhead, batch_first=True),
                num_layers
            )

            # Decoder
            self.decoder_embedding = nn.Embedding(output_vocab_size, d_model)
            self.decoder_pos_encoding = nn.Parameter(torch.randn(1000, d_model))
            self.decoder = nn.TransformerDecoder(
                nn.TransformerDecoderLayer(d_model, nhead, batch_first=True),
                num_layers
            )

            # Output projection
            self.output_projection = nn.Linear(d_model, output_vocab_size)
            self.dropout = nn.Dropout(0.1)

        def forward(self, src, tgt, src_mask=None, tgt_mask=None):
            # Encoder
            src_seq_len = src.size(1)
            src_emb = self.encoder_embedding(src) * math.sqrt(self.d_model)
            src_emb = src_emb + self.encoder_pos_encoding[:src_seq_len, :].unsqueeze(0)
            src_emb = self.dropout(src_emb)
            encoder_output = self.encoder(src_emb, src_key_padding_mask=src_mask)

            # Decoder
            tgt_seq_len = tgt.size(1)
            tgt_emb = self.decoder_embedding(tgt) * math.sqrt(self.d_model)
            tgt_emb = tgt_emb + self.decoder_pos_encoding[:tgt_seq_len, :].unsqueeze(0)
            tgt_emb = self.dropout(tgt_emb)
            decoder_output = self.decoder(tgt_emb, encoder_output,
                                          tgt_mask=tgt_mask,
                                          memory_key_padding_mask=src_mask)

            return self.output_projection(decoder_output)

    model = Seq2SeqModel(input_vocab_size=30000, output_vocab_size=30000)

    # Use NovoGrad for NLP tasks
    optimizer = torchium.optimizers.NovoGrad(
        model.parameters(),
        lr=1e-3,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0.01,
        grad_averaging=True
    )

    # Combined loss for seq2seq
    class Seq2SeqLoss(nn.Module):
        def __init__(self, vocab_size=30000):
            super().__init__()
            self.ce_loss = torchium.losses.CrossEntropyLoss()
            self.label_smoothing = torchium.losses.LabelSmoothingLoss(
                num_classes=vocab_size,
                smoothing=0.1
            )

        def forward(self, pred, target):
            ce_loss = self.ce_loss(pred, target)
            smooth_loss = self.label_smoothing(pred, target)
            return 0.7 * ce_loss + 0.3 * smooth_loss

    criterion = Seq2SeqLoss()

    # Training loop for seq2seq
    def train_seq2seq(model, optimizer, criterion, dataloader, num_epochs=100):
        model.train()
        for epoch in range(num_epochs):
            total_loss = 0
            for batch in dataloader:
                optimizer.zero_grad()

                # Forward pass
                output = model(batch.src, batch.tgt, batch.src_mask, batch.tgt_mask)

                # Compute loss
                loss = criterion(output.view(-1, output.size(-1)), batch.tgt.view(-1))

                # Backward pass
                loss.backward()

                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                optimizer.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(dataloader)
            print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')
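The decoder needs a causal (subsequent) mask so that position ``t`` cannot attend to later target positions. If your dataloader does not already provide ``batch.tgt_mask``, here is a minimal sketch for building one, using the additive ``-inf`` convention accepted by ``nn.TransformerDecoder``:

.. code-block:: python

    def make_causal_mask(size, device=None):
        # -inf above the diagonal blocks attention to future positions; 0 elsewhere allows it
        mask = torch.full((size, size), float('-inf'), device=device)
        return torch.triu(mask, diagonal=1)

    # Example usage inside the training loop (batch.src / batch.tgt as above)
    tgt_mask = make_causal_mask(batch.tgt.size(1), device=batch.tgt.device)
    output = model(batch.src, batch.tgt, batch.src_mask, tgt_mask)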
Word Embeddings
---------------

Word2Vec Training
~~~~~~~~~~~~~~~~~

.. code-block:: python

    class Word2VecModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim=300):
            super().__init__()
            self.target_embeddings = nn.Embedding(vocab_size, embedding_dim)
            self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)
            self.embedding_dim = embedding_dim

        def forward(self, target_words, context_words, negative_words):
            # Target embeddings
            target_emb = self.target_embeddings(target_words)

            # Context embeddings
            context_emb = self.context_embeddings(context_words)

            # Negative embeddings
            negative_emb = self.context_embeddings(negative_words)

            return target_emb, context_emb, negative_emb

    model = Word2VecModel(vocab_size=100000, embedding_dim=300)

    # Use SGD for word embeddings
    optimizer = torchium.optimizers.SGD(
        model.parameters(),
        lr=0.025,
        momentum=0.9
    )

    # Word2Vec specific loss
    criterion = torchium.losses.Word2VecLoss(
        vocab_size=100000,
        embedding_dim=300,
        negative_samples=5
    )

    # Training loop for Word2Vec
    def train_word2vec(model, optimizer, criterion, dataloader, num_epochs=100):
        model.train()
        for epoch in range(num_epochs):
            total_loss = 0
            for batch in dataloader:
                optimizer.zero_grad()

                # Forward pass
                target_emb, context_emb, negative_emb = model(
                    batch.target_words, batch.context_words, batch.negative_words
                )

                # Compute loss
                loss = criterion(target_emb, context_emb, negative_emb)

                # Backward pass
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(dataloader)
            print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')

GloVe Training
~~~~~~~~~~~~~~

.. code-block:: python

    class GloVeModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim=300):
            super().__init__()
            self.target_embeddings = nn.Embedding(vocab_size, embedding_dim)
            self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)
            self.target_bias = nn.Embedding(vocab_size, 1)
            self.context_bias = nn.Embedding(vocab_size, 1)

        def forward(self, target_words, context_words, cooccurrence_counts):
            # Target embeddings
            target_emb = self.target_embeddings(target_words)
            target_bias = self.target_bias(target_words)

            # Context embeddings
            context_emb = self.context_embeddings(context_words)
            context_bias = self.context_bias(context_words)

            return target_emb, context_emb, target_bias, context_bias, cooccurrence_counts

    model = GloVeModel(vocab_size=100000, embedding_dim=300)

    # Use Adam for GloVe
    optimizer = torchium.optimizers.Adam(
        model.parameters(),
        lr=0.05
    )

    # GloVe specific loss
    criterion = torchium.losses.GloVeLoss(
        vocab_size=100000,
        embedding_dim=300,
        x_max=100.0,
        alpha=0.75
    )

    # Training loop for GloVe
    def train_glove(model, optimizer, criterion, dataloader, num_epochs=100):
        model.train()
        for epoch in range(num_epochs):
            total_loss = 0
            for batch in dataloader:
                optimizer.zero_grad()

                # Forward pass
                target_emb, context_emb, target_bias, context_bias, cooccurrence_counts = model(
                    batch.target_words, batch.context_words, batch.cooccurrence_counts
                )

                # Compute loss
                loss = criterion(target_emb, context_emb, target_bias, context_bias, cooccurrence_counts)

                # Backward pass
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(dataloader)
            print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')
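Once either embedding model has been trained, a quick qualitative check is to look up the nearest neighbours of a word in embedding space. The cosine-similarity sketch below is not part of Torchium; ``word_id`` is assumed to be the vocabulary index of the query word.

.. code-block:: python

    import torch.nn.functional as F

    @torch.no_grad()
    def nearest_neighbors(model, word_id, k=10):
        # Normalize the target embedding table and rank by cosine similarity
        emb = F.normalize(model.target_embeddings.weight, dim=-1)
        query = emb[word_id]
        scores = emb @ query
        scores[word_id] = float('-inf')  # exclude the query word itself
        return torch.topk(scores, k).indices.tolist()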
Text Classification
-------------------

BERT-style Classification
~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    class BERTClassifier(nn.Module):
        def __init__(self, vocab_size, d_model=768, nhead=12, num_layers=12, num_classes=2):
            super().__init__()
            self.d_model = d_model
            self.embedding = nn.Embedding(vocab_size, d_model)
            self.pos_encoding = nn.Parameter(torch.randn(512, d_model))
            self.segment_embedding = nn.Embedding(2, d_model)
            self.transformer = nn.TransformerEncoder(
                nn.TransformerEncoderLayer(d_model, nhead, batch_first=True),
                num_layers
            )
            self.classifier = nn.Sequential(
                nn.Linear(d_model, d_model),
                nn.ReLU(),
                nn.Dropout(0.1),
                nn.Linear(d_model, num_classes)
            )

        def forward(self, input_ids, attention_mask, segment_ids):
            seq_len = input_ids.size(1)
            x = self.embedding(input_ids) * math.sqrt(self.d_model)
            x = x + self.pos_encoding[:seq_len, :].unsqueeze(0)
            x = x + self.segment_embedding(segment_ids)
            x = self.transformer(x, src_key_padding_mask=attention_mask)

            # Use [CLS] token for classification
            cls_output = x[:, 0, :]
            return self.classifier(cls_output)

    model = BERTClassifier(vocab_size=30000, num_classes=2)

    # Use AdamW for BERT-style training
    optimizer = torchium.optimizers.AdamW(
        model.parameters(),
        lr=2e-5,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0.01
    )

    # Classification loss with label smoothing
    criterion = torchium.losses.LabelSmoothingLoss(
        num_classes=2,
        smoothing=0.1
    )

    # Training loop for BERT classification
    def train_bert_classifier(model, optimizer, criterion, dataloader, num_epochs=10):
        model.train()
        for epoch in range(num_epochs):
            total_loss = 0
            for batch in dataloader:
                optimizer.zero_grad()

                # Forward pass
                output = model(batch.input_ids, batch.attention_mask, batch.segment_ids)

                # Compute loss
                loss = criterion(output, batch.labels)

                # Backward pass
                loss.backward()

                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                optimizer.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(dataloader)
            print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')

Named Entity Recognition
------------------------

CRF-based NER
~~~~~~~~~~~~~

.. code-block:: python

    class NERModel(nn.Module):
        def __init__(self, vocab_size, num_tags, embedding_dim=128, hidden_dim=256):
            super().__init__()
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
            self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
            self.classifier = nn.Linear(hidden_dim * 2, num_tags)

        def forward(self, input_ids, attention_mask):
            x = self.embedding(input_ids)
            x, _ = self.lstm(x)
            x = self.classifier(x)
            return x

    model = NERModel(vocab_size=30000, num_tags=9)  # BIO tagging scheme

    # Use AdamW for NER
    optimizer = torchium.optimizers.AdamW(
        model.parameters(),
        lr=1e-3,
        weight_decay=1e-4
    )

    # CRF loss for NER
    criterion = torchium.losses.CRFLoss(
        num_tags=9,
        batch_first=True
    )

    # Training loop for NER
    def train_ner_model(model, optimizer, criterion, dataloader, num_epochs=100):
        model.train()
        for epoch in range(num_epochs):
            total_loss = 0
            for batch in dataloader:
                optimizer.zero_grad()

                # Forward pass
                output = model(batch.input_ids, batch.attention_mask)

                # Compute loss
                loss = criterion(output, batch.tags, batch.attention_mask)

                # Backward pass
                loss.backward()

                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                optimizer.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(dataloader)
            print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')
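For quick sanity checks while training the NER model, token-level accuracy over non-padded positions can be computed directly from the emission scores. This is a simple greedy sketch (argmax over tags, no CRF decoding), assuming ``attention_mask`` marks real tokens with 1 and padding with 0:

.. code-block:: python

    @torch.no_grad()
    def token_accuracy(model, dataloader):
        model.eval()
        correct, total = 0, 0
        for batch in dataloader:
            emissions = model(batch.input_ids, batch.attention_mask)
            preds = emissions.argmax(dim=-1)
            mask = batch.attention_mask.bool()
            correct += ((preds == batch.tags) & mask).sum().item()
            total += mask.sum().item()
        model.train()
        return correct / max(total, 1)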
Text Generation
---------------

GPT-style Generation
~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    class GPTModel(nn.Module):
        def __init__(self, vocab_size, d_model=768, nhead=12, num_layers=12, max_len=1024):
            super().__init__()
            self.d_model = d_model
            self.embedding = nn.Embedding(vocab_size, d_model)
            self.pos_encoding = nn.Parameter(torch.randn(max_len, d_model))
            self.transformer = nn.TransformerDecoder(
                nn.TransformerDecoderLayer(d_model, nhead, batch_first=True),
                num_layers
            )
            self.classifier = nn.Linear(d_model, vocab_size)

        def forward(self, input_ids, attention_mask=None):
            seq_len = input_ids.size(1)
            x = self.embedding(input_ids) * math.sqrt(self.d_model)
            x = x + self.pos_encoding[:seq_len, :].unsqueeze(0)

            # Create causal mask on the same device as the inputs
            tgt_mask = torch.triu(
                torch.ones(seq_len, seq_len, device=input_ids.device), diagonal=1
            ).bool()

            x = self.transformer(x, x, tgt_mask=tgt_mask, memory_key_padding_mask=attention_mask)
            return self.classifier(x)

    model = GPTModel(vocab_size=50000, d_model=768, nhead=12, num_layers=12)

    # Use AdamW for GPT training
    optimizer = torchium.optimizers.AdamW(
        model.parameters(),
        lr=1e-4,
        betas=(0.9, 0.95),
        eps=1e-8,
        weight_decay=0.1
    )

    # Perplexity loss for text generation
    criterion = torchium.losses.PerplexityLoss()

    # Training loop for GPT
    def train_gpt_model(model, optimizer, criterion, dataloader, num_epochs=100):
        model.train()
        for epoch in range(num_epochs):
            total_loss = 0
            for batch in dataloader:
                optimizer.zero_grad()

                # Forward pass
                output = model(batch.input_ids, batch.attention_mask)

                # Compute loss
                loss = criterion(output, batch.target_ids)

                # Backward pass
                loss.backward()

                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                optimizer.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(dataloader)
            print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')
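After training, text can be generated autoregressively by repeatedly feeding the model its own predictions. The following is a minimal top-k sampling sketch; the ``top_k`` and ``temperature`` values are illustrative, and the prompt must stay within the model's ``max_len``.

.. code-block:: python

    @torch.no_grad()
    def generate(model, prompt_ids, max_new_tokens=50, top_k=50, temperature=1.0):
        model.eval()
        tokens = prompt_ids  # shape: (1, prompt_len)
        for _ in range(max_new_tokens):
            # Logits for the last position only
            logits = model(tokens)[:, -1, :] / temperature
            topk_logits, topk_indices = torch.topk(logits, top_k, dim=-1)
            probs = torch.softmax(topk_logits, dim=-1)
            # Sample from the top-k candidates and append to the sequence
            next_token = topk_indices.gather(-1, torch.multinomial(probs, num_samples=1))
            tokens = torch.cat([tokens, next_token], dim=1)
        return tokens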
Multi-Task NLP
--------------

Multi-Task Learning for NLP
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    class MultiTaskNLPModel(nn.Module):
        def __init__(self, vocab_size, d_model=512, nhead=8, num_layers=6):
            super().__init__()
            self.d_model = d_model
            self.embedding = nn.Embedding(vocab_size, d_model)
            self.pos_encoding = nn.Parameter(torch.randn(1000, d_model))
            self.transformer = nn.TransformerEncoder(
                nn.TransformerEncoderLayer(d_model, nhead, batch_first=True),
                num_layers
            )

            # Task-specific heads
            self.classifier = nn.Linear(d_model, 2)      # Sentiment analysis
            self.ner_classifier = nn.Linear(d_model, 9)  # Named entity recognition
            self.qa_classifier = nn.Linear(d_model, 2)   # Question answering

        def forward(self, input_ids, attention_mask, task_type):
            seq_len = input_ids.size(1)
            x = self.embedding(input_ids) * math.sqrt(self.d_model)
            x = x + self.pos_encoding[:seq_len, :].unsqueeze(0)
            x = self.transformer(x, src_key_padding_mask=attention_mask)

            # Use [CLS] token for classification tasks
            cls_output = x[:, 0, :]

            if task_type == 'classification':
                return self.classifier(cls_output)
            elif task_type == 'ner':
                return self.ner_classifier(x)
            elif task_type == 'qa':
                return self.qa_classifier(cls_output)

    model = MultiTaskNLPModel(vocab_size=30000)

    # Use PCGrad for multi-task learning
    optimizer = torchium.optimizers.PCGrad(
        model.parameters(),
        lr=1e-3
    )

    # Multi-task loss with uncertainty weighting
    class MultiTaskNLPLoss(nn.Module):
        def __init__(self):
            super().__init__()
            self.uncertainty_loss = torchium.losses.UncertaintyWeightingLoss(num_tasks=3)
            self.cls_loss = torchium.losses.CrossEntropyLoss()
            self.ner_loss = torchium.losses.CRFLoss(num_tags=9, batch_first=True)
            self.qa_loss = torchium.losses.CrossEntropyLoss()

        def forward(self, cls_pred, ner_pred, qa_pred,
                    cls_target, ner_target, qa_target, attention_mask):
            cls_loss = self.cls_loss(cls_pred, cls_target)
            ner_loss = self.ner_loss(ner_pred, ner_target, attention_mask)
            qa_loss = self.qa_loss(qa_pred, qa_target)
            return self.uncertainty_loss([cls_loss, ner_loss, qa_loss])

    criterion = MultiTaskNLPLoss()

    # Training loop for multi-task NLP
    def train_multitask_nlp(model, optimizer, criterion, dataloader, num_epochs=100):
        model.train()
        for epoch in range(num_epochs):
            total_loss = 0
            for batch in dataloader:
                optimizer.zero_grad()

                # Forward pass for each task
                cls_pred = model(batch.input_ids, batch.attention_mask, 'classification')
                ner_pred = model(batch.input_ids, batch.attention_mask, 'ner')
                qa_pred = model(batch.input_ids, batch.attention_mask, 'qa')

                # Compute loss
                loss = criterion(cls_pred, ner_pred, qa_pred,
                                 batch.cls_targets, batch.ner_targets, batch.qa_targets,
                                 batch.attention_mask)

                # Backward pass with gradient surgery
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(dataloader)
            print(f'Epoch {epoch}, Loss: {avg_loss:.4f}')

These examples show how Torchium's specialized optimizers and loss functions can be combined for a range of NLP tasks, from language modeling and word embeddings to classification, sequence labeling, text generation, and multi-task learning.