Chapter 16: Efficient Transformer Architectures

Prompt engineering, parameter-efficient fine-tuning, RLHF and alignment, evaluation, and efficient attention and inference optimization.

Learning Objectives

["Understand efficient attention", "Apply quantization", "Optimize inference"]


16.1 Prompt Engineering

Prompt engineering is the art and science of crafting inputs that elicit desired behaviors from language models. Effective prompts can dramatically improve model performance without any parameter changes.

Basic Prompt Structure

Well-structured prompts provide clear context, instructions, and output format specifications.

PYTHON
class PromptTemplate:
    def __init__(self, template):
        self.template = template

    def format(self, **kwargs):
        return self.template.format(**kwargs)


class StructuredPrompt:
    def __init__(self):
        self.sections = []

    def add_system(self, content):
        self.sections.append(("system", content))
        return self

    def add_context(self, content):
        self.sections.append(("context", content))
        return self

    def add_instruction(self, content):
        self.sections.append(("instruction", content))
        return self

    def add_examples(self, examples):
        self.sections.append(("examples", examples))
        return self

    def add_output_format(self, format_spec):
        self.sections.append(("format", format_spec))
        return self

    def build(self):
        prompt = ""
        for section_type, content in self.sections:
            if section_type == "system":
                prompt += f"System: {content}\n\n"
            elif section_type == "context":
                prompt += f"Context:\n{content}\n\n"
            elif section_type == "instruction":
                prompt += f"Task: {content}\n\n"
            elif section_type == "examples":
                prompt += "Examples:\n"
                for inp, out in content:
                    prompt += f"Input: {inp}\nOutput: {out}\n\n"
            elif section_type == "format":
                prompt += f"Output format: {content}\n\n"
        return prompt


# Example usage
prompt = (
    StructuredPrompt()
    .add_system("You are a helpful assistant that extracts structured data.")
    .add_instruction("Extract the person's name, age, and occupation from the text.")
    .add_examples([
        ("John Smith is a 35-year-old engineer.", '{"name": "John Smith", "age": 35, "occupation": "engineer"}'),
        ("Dr. Jane Doe, 42, works as a surgeon.", '{"name": "Jane Doe", "age": 42, "occupation": "surgeon"}')
    ])
    .add_output_format("JSON object with keys: name, age, occupation")
    .build()
)

Few-Shot Prompting

Providing examples helps models understand the desired task format and style.

PYTHON
class FewShotPrompt:
    def __init__(self, task_description):
        self.task_description = task_description
        self.examples = []

    def add_example(self, input_text, output_text):
        self.examples.append((input_text, output_text))
        return self

    def build_prompt(self, query):
        prompt = f"{self.task_description}\n\n"

        for i, (inp, out) in enumerate(self.examples, 1):
            prompt += f"Example {i}:\n"
            prompt += f"Input: {inp}\n"
            prompt += f"Output: {out}\n\n"

        prompt += f"Now complete this:\nInput: {query}\nOutput:"
        return prompt


class DynamicFewShot:
    def __init__(self, example_bank, embedder, k=3):
        self.example_bank = example_bank
        self.embedder = embedder
        self.k = k

    def select_examples(self, query):
        query_embedding = self.embedder.encode(query)
        similarities = []

        for example in self.example_bank:
            example_embedding = self.embedder.encode(example["input"])
            sim = self.cosine_similarity(query_embedding, example_embedding)
            similarities.append((sim, example))

        similarities.sort(key=lambda item: item[0], reverse=True)
        return [ex for _, ex in similarities[:self.k]]

    def cosine_similarity(self, a, b):
        return sum(x * y for x, y in zip(a, b)) / (
            sum(x**2 for x in a)**0.5 * sum(y**2 for y in b)**0.5
        )

    def build_prompt(self, query):
        selected = self.select_examples(query)
        prompt = "Complete the task based on these examples:\n\n"

        for ex in selected:
            prompt += f"Input: {ex['input']}\nOutput: {ex['output']}\n\n"

        prompt += f"Input: {query}\nOutput:"
        return prompt
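
To make the retrieval step concrete, here is a minimal usage sketch. ToyEmbedder is a hypothetical stand-in that hashes characters into a fixed-length vector; a real setup would use a sentence-embedding model exposing the same encode() method.

PYTHON
# Hypothetical usage of DynamicFewShot with a toy embedder (illustration only).
class ToyEmbedder:
    def encode(self, text):
        # Bag-of-characters vector; a real embedder returns semantic vectors.
        vec = [0.0] * 32
        for ch in text.lower():
            vec[ord(ch) % 32] += 1.0
        return vec

example_bank = [
    {"input": "The movie was fantastic!", "output": "positive"},
    {"input": "Terrible service, never again.", "output": "negative"},
    {"input": "It was okay, nothing special.", "output": "neutral"},
]

selector = DynamicFewShot(example_bank, ToyEmbedder(), k=2)
print(selector.build_prompt("Absolutely loved the food."))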

Chain-of-Thought Prompting

Encouraging step-by-step reasoning improves performance on complex tasks.

PYTHON
class ChainOfThoughtPrompt:
    COT_TRIGGER = "Let's think step by step."

    def __init__(self, task_type="reasoning"):
        self.task_type = task_type

    def build_zero_shot_cot(self, question):
        return f"{question}\n\n{self.COT_TRIGGER}"

    def build_few_shot_cot(self, question, examples):
        prompt = "Solve each problem by thinking step by step.\n\n"

        for q, reasoning, answer in examples:
            prompt += f"Question: {q}\n"
            prompt += f"Reasoning: {reasoning}\n"
            prompt += f"Answer: {answer}\n\n"

        prompt += f"Question: {question}\nReasoning:"
        return prompt


class SelfConsistency:
    def __init__(self, model, num_samples=5, temperature=0.7):
        self.model = model
        self.num_samples = num_samples
        self.temperature = temperature

    def solve(self, prompt):
        answers = []

        for _ in range(self.num_samples):
            response = self.model.generate(
                prompt,
                temperature=self.temperature
            )
            answer = self.extract_answer(response)
            answers.append(answer)

        return self.majority_vote(answers)

    def extract_answer(self, response):
        lines = response.strip().split("\n")
        for line in reversed(lines):
            if line.startswith("Answer:"):
                return line.replace("Answer:", "").strip()
        return lines[-1]

    def majority_vote(self, answers):
        from collections import Counter
        counts = Counter(answers)
        return counts.most_common(1)[0][0]
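
A usage sketch shows how the voting behaves; StubModel and its canned responses are purely illustrative, not a real generation API.

PYTHON
import random

# Stub "model" whose generate() returns canned chain-of-thought answers.
class StubModel:
    RESPONSES = [
        "3 boxes with 4 apples each: 3 * 4 = 12.\nAnswer: 12",
        "4 + 4 + 4 = 12.\nAnswer: 12",
        "Miscounting gives 11.\nAnswer: 11",
    ]

    def generate(self, prompt, temperature=1.0):
        return random.choice(self.RESPONSES)

solver = SelfConsistency(StubModel(), num_samples=5)
print(solver.solve("How many apples are in 3 boxes of 4?"))  # usually prints "12"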

Structured Output Prompting

Guiding models to produce specific output formats improves reliability.

PYTHON
import json

class JSONPrompt:
    def __init__(self, schema):
        self.schema = schema

    def build_prompt(self, task, input_text):
        schema_str = json.dumps(self.schema, indent=2)
        return f"""{task}

Input: {input_text}

Respond with a JSON object matching this schema:
{schema_str}

JSON response:"""

    def parse_response(self, response):
        try:
            start = response.find("{")
            end = response.rfind("}") + 1
            json_str = response[start:end]
            return json.loads(json_str)
        except (json.JSONDecodeError, ValueError):
            return None


class XMLPrompt:
    def __init__(self, tags):
        self.tags = tags

    def build_prompt(self, task, input_text):
        tag_list = ", ".join(self.tags)
        return f"""{task}

Input: {input_text}

Respond using these XML tags: {tag_list}
Wrap each piece of information in the appropriate tag.

Response:"""

    def parse_response(self, response):
        import re
        result = {}
        for tag in self.tags:
            pattern = f"<{tag}>(.*?)</{tag}>"
            match = re.search(pattern, response, re.DOTALL)
            if match:
                result[tag] = match.group(1).strip()
        return result

Role and Persona Prompting

Assigning roles shapes model behavior and expertise level.

PYTHON
class PersonaPrompt:
    PERSONAS = {
        "expert": "You are a world-renowned expert with decades of experience.",
        "teacher": "You are a patient teacher who explains concepts clearly.",
        "critic": "You are a thorough critic who identifies flaws and improvements.",
        "assistant": "You are a helpful assistant focused on practical solutions."
    }

    def __init__(self, persona, domain=None):
        self.persona = persona
        self.domain = domain

    def build_system_prompt(self):
        base = self.PERSONAS.get(self.persona, self.persona)
        if self.domain:
            base += f" Your expertise is in {self.domain}."
        return base

    def build_full_prompt(self, task):
        system = self.build_system_prompt()
        return f"System: {system}\n\nUser: {task}\n\nAssistant:"


class MultiPersonaDebate:
    def __init__(self, personas, rounds=3):
        self.personas = personas
        self.rounds = rounds

    def build_debate_prompt(self, topic, previous_arguments=None):
        prompt = f"Topic: {topic}\n\n"

        if previous_arguments:
            prompt += "Previous arguments:\n"
            for persona, argument in previous_arguments:
                prompt += f"{persona}: {argument}\n\n"

        prompt += "Provide your perspective, addressing previous points if any:"
        return prompt

Key Takeaways

Prompt engineering is essential for effective LLM use. Structured prompts with clear sections improve reliability. Few-shot examples guide model behavior without training. Chain-of-thought prompting enhances reasoning tasks. Output format specifications ensure parseable responses. Role prompting shapes expertise and communication style.

16.2 Fine-tuning LLMs

Fine-tuning adapts pre-trained language models to specific tasks or domains. Modern techniques enable efficient adaptation without updating all model parameters.

Full Fine-tuning

Traditional fine-tuning updates all model parameters on task-specific data.

PYTHON
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

class FullFineTuner:
    def __init__(self, model, tokenizer, learning_rate=2e-5):
        self.model = model
        self.tokenizer = tokenizer
        self.optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=learning_rate,
            weight_decay=0.01
        )

    def prepare_batch(self, texts, labels):
        encodings = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        return encodings, torch.tensor(labels)

    def train_epoch(self, dataloader):
        self.model.train()
        total_loss = 0

        for batch in dataloader:
            self.optimizer.zero_grad()

            inputs, labels = batch
            outputs = self.model(**inputs, labels=labels)
            loss = outputs.loss

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()

            total_loss += loss.item()

        return total_loss / len(dataloader)


class InstructionTuningDataset:
    def __init__(self, data, tokenizer, max_length=2048):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        prompt = self.format_instruction(item)

        encoding = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )

        labels = encoding["input_ids"].clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": labels.squeeze()
        }

    def format_instruction(self, item):
        return f"""### Instruction:
{item['instruction']}

### Input:
{item.get('input', '')}

### Response:
{item['output']}"""

LoRA: Low-Rank Adaptation

LoRA adds trainable low-rank matrices to frozen pre-trained weights.

PYTHON
import torch
import torch.nn as nn
import math

class LoRALayer(nn.Module):
    def __init__(self, in_features, out_features, rank=8, alpha=16):
        super().__init__()
        self.rank = rank
        self.alpha = alpha
        self.scaling = alpha / rank

        self.lora_A = nn.Parameter(torch.zeros(in_features, rank))
        self.lora_B = nn.Parameter(torch.zeros(rank, out_features))

        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B)

    def forward(self, x):
        return (x @ self.lora_A @ self.lora_B) * self.scaling


class LoRALinear(nn.Module):
    def __init__(self, original_layer, rank=8, alpha=16):
        super().__init__()
        self.original = original_layer
        self.original.weight.requires_grad = False

        in_features = original_layer.in_features
        out_features = original_layer.out_features

        self.lora = LoRALayer(in_features, out_features, rank, alpha)

    def forward(self, x):
        return self.original(x) + self.lora(x)


def apply_lora_to_model(model, rank=8, alpha=16, target_modules=None):
    if target_modules is None:
        target_modules = ["q_proj", "v_proj"]

    # Collect the target layers first, then replace them, so the module tree
    # is not mutated while it is being traversed.
    layers_to_wrap = [
        (name, module)
        for name, module in model.named_modules()
        if any(target in name for target in target_modules)
        and isinstance(module, nn.Linear)
    ]

    for name, module in layers_to_wrap:
        parent_name = ".".join(name.split(".")[:-1])
        child_name = name.split(".")[-1]
        parent = model.get_submodule(parent_name)
        setattr(parent, child_name, LoRALinear(module, rank, alpha))

    return model


def count_trainable_params(model):
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Trainable: {trainable:,} ({100*trainable/total:.2f}%)")
    print(f"Total: {total:,}")
    return trainable, total
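
After training, the low-rank update can be folded back into the frozen weight so inference runs at the original speed. Below is a minimal sketch against the LoRALinear wrapper above; merge_lora_weights is our name for illustration, not a library function.

PYTHON
def merge_lora_weights(model):
    # Fold each LoRA update into its frozen base weight:
    # y = x @ W^T + (x @ A @ B) * s  ==>  W' = W + s * (A @ B)^T
    for name, module in list(model.named_modules()):
        if isinstance(module, LoRALinear):
            delta = (module.lora.lora_A @ module.lora.lora_B) * module.lora.scaling
            module.original.weight.data += delta.T
            parent_name = ".".join(name.split(".")[:-1])
            child_name = name.split(".")[-1]
            parent = model.get_submodule(parent_name)
            setattr(parent, child_name, module.original)
    return model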

QLoRA: Quantized LoRA

QLoRA combines 4-bit quantization with LoRA for memory-efficient fine-tuning.

PYTHON
class QLoRAConfig:
    def __init__(
        self,
        lora_rank=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bits=4,
        double_quant=True,
        quant_type="nf4"
    ):
        self.lora_rank = lora_rank
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout
        self.bits = bits
        self.double_quant = double_quant
        self.quant_type = quant_type


class NF4Quantizer:
    NF4_LEVELS = [
        -1.0, -0.6962, -0.5251, -0.3949, -0.2844, -0.1848, -0.0911, 0.0,
        0.0796, 0.1609, 0.2461, 0.3379, 0.4407, 0.5626, 0.7230, 1.0
    ]

    def __init__(self, block_size=64):
        self.block_size = block_size
        self.levels = torch.tensor(self.NF4_LEVELS)

    def quantize(self, weights):
        original_shape = weights.shape
        weights = weights.view(-1, self.block_size)

        scales = weights.abs().max(dim=1, keepdim=True)[0]
        normalized = weights / (scales + 1e-8)

        distances = (normalized.unsqueeze(-1) - self.levels).abs()
        indices = distances.argmin(dim=-1)

        return indices.to(torch.uint8), scales, original_shape

    def dequantize(self, indices, scales, original_shape):
        values = self.levels[indices.long()]
        weights = values * scales
        return weights.view(original_shape)


class QLoRALinear(nn.Module):
    def __init__(self, in_features, out_features, rank=64, alpha=16, bits=4):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        self.quantizer = NF4Quantizer()
        self.register_buffer("weight_indices", None)
        self.register_buffer("weight_scales", None)

        self.lora_A = nn.Parameter(torch.zeros(in_features, rank))
        self.lora_B = nn.Parameter(torch.zeros(rank, out_features))
        self.scaling = alpha / rank

        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B)

    def set_quantized_weights(self, weights):
        indices, scales, _ = self.quantizer.quantize(weights)
        self.weight_indices = indices
        self.weight_scales = scales

    def forward(self, x):
        base_weight = self.quantizer.dequantize(
            self.weight_indices,
            self.weight_scales,
            (self.out_features, self.in_features)
        )
        base_out = x @ base_weight.T
        lora_out = (x @ self.lora_A @ self.lora_B) * self.scaling
        return base_out + lora_out
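
A quick round-trip through the NF4 quantizer above gives a feel for the reconstruction error; the numbers are illustrative, not a benchmark.

PYTHON
quantizer = NF4Quantizer(block_size=64)
weights = torch.randn(256, 256)  # any tensor whose size is a multiple of block_size

indices, scales, shape = quantizer.quantize(weights)
restored = quantizer.dequantize(indices, scales, shape)

# 4-bit indices plus one scale per 64-value block, versus 32-bit floats.
print(f"mean abs error: {(weights - restored).abs().mean().item():.4f}")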

Instruction Tuning

Training models to follow instructions across diverse tasks.

PYTHON
class InstructionTuner:
    TASK_TEMPLATES = {
        "summarize": "Summarize the following text:\n{input}\n\nSummary:",
        "translate": "Translate to {target_lang}:\n{input}\n\nTranslation:",
        "qa": "Answer based on the context.\nContext: {context}\nQuestion: {question}\nAnswer:",
        "classify": "Classify the sentiment of this text as positive, negative, or neutral:\n{input}\nSentiment:",
    }

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def create_training_example(self, task_type, inputs, output):
        template = self.TASK_TEMPLATES.get(task_type, "{input}")
        prompt = template.format(**inputs)
        full_text = f"{prompt} {output}"
        return full_text

    def prepare_multitask_dataset(self, datasets):
        all_examples = []
        for task_name, task_data in datasets.items():
            for item in task_data:
                example = self.create_training_example(
                    task_name,
                    item["inputs"],
                    item["output"]
                )
                all_examples.append({
                    "text": example,
                    "task": task_name
                })
        return all_examples


class ChatFineTuner:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def format_conversation(self, messages):
        formatted = ""
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            if role == "system":
                formatted += f"<|system|>\n{content}</s>\n"
            elif role == "user":
                formatted += f"<|user|>\n{content}</s>\n"
            elif role == "assistant":
                formatted += f"<|assistant|>\n{content}</s>\n"
        return formatted

    def create_training_batch(self, conversations):
        texts = [self.format_conversation(conv) for conv in conversations]
        return self.tokenizer(texts, padding=True, return_tensors="pt")

Key Takeaways

Fine-tuning adapts LLMs to specific tasks and domains. Full fine-tuning updates all parameters but requires significant compute. LoRA adds small trainable adapters while keeping base weights frozen. QLoRA combines quantization with LoRA for memory efficiency. Instruction tuning creates models that follow diverse task instructions. These techniques enable customization of large models on limited hardware.

16.3 RLHF and Alignment

Reinforcement Learning from Human Feedback (RLHF) aligns language models with human preferences. This process transforms capable but unpredictable models into helpful, harmless, and honest assistants.

Reward Model Training

A reward model learns to predict human preferences from comparison data.

PYTHON
import torch
import torch.nn as nn
import torch.nn.functional as F

class RewardModel(nn.Module):
    def __init__(self, base_model, hidden_size):
        super().__init__()
        self.base_model = base_model
        self.reward_head = nn.Linear(hidden_size, 1)

        for param in self.base_model.parameters():
            param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )
        last_hidden = outputs.hidden_states[-1]
        last_token_hidden = last_hidden[:, -1, :]
        reward = self.reward_head(last_token_hidden)
        return reward.squeeze(-1)


class RewardModelTrainer:
    def __init__(self, model, learning_rate=1e-5):
        self.model = model
        self.optimizer = torch.optim.AdamW(
            model.reward_head.parameters(),
            lr=learning_rate
        )

    def compute_preference_loss(self, chosen_rewards, rejected_rewards):
        return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()

    def train_step(self, chosen_batch, rejected_batch):
        self.optimizer.zero_grad()

        chosen_rewards = self.model(
            chosen_batch["input_ids"],
            chosen_batch["attention_mask"]
        )
        rejected_rewards = self.model(
            rejected_batch["input_ids"],
            rejected_batch["attention_mask"]
        )

        loss = self.compute_preference_loss(chosen_rewards, rejected_rewards)
        loss.backward()
        self.optimizer.step()

        accuracy = (chosen_rewards > rejected_rewards).float().mean()
        return loss.item(), accuracy.item()
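
The preference loss is simply a logistic loss on the reward margin between the chosen and rejected completion. A tiny numeric check with made-up reward values, not real model outputs:

PYTHON
# Made-up reward values for three comparison pairs.
chosen_rewards = torch.tensor([2.0, 1.5, 0.3])
rejected_rewards = torch.tensor([1.0, 1.8, -0.5])

loss = -F.logsigmoid(chosen_rewards - rejected_rewards).mean()
accuracy = (chosen_rewards > rejected_rewards).float().mean()
print(f"loss={loss.item():.3f}, accuracy={accuracy.item():.2f}")  # accuracy 0.67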

PPO Training

Proximal Policy Optimization fine-tunes the model using reward signals.

PYTHON
class PPOTrainer:
    def __init__(self, policy_model, ref_model, reward_model, tokenizer,
                 kl_coef=0.1, clip_range=0.2, learning_rate=1e-6):
        self.policy = policy_model
        self.ref = ref_model
        self.reward = reward_model
        self.tokenizer = tokenizer
        self.kl_coef = kl_coef
        self.clip_range = clip_range

        self.optimizer = torch.optim.AdamW(
            self.policy.parameters(), lr=learning_rate
        )

    def compute_rewards(self, prompts, responses):
        full_texts = [p + r for p, r in zip(prompts, responses)]
        encodings = self.tokenizer(full_texts, padding=True, return_tensors="pt")
        rewards = self.reward(**encodings)
        return rewards

    def ppo_step(self, batch):
        prompts = batch["prompts"]

        with torch.no_grad():
            # generate_with_logprobs, compute_ref_logprobs and compute_policy_logprobs
            # are assumed to be implemented elsewhere (rollout generation and
            # per-token log-probability computation for each model).
            responses, old_logprobs = self.generate_with_logprobs(prompts)
            ref_logprobs = self.compute_ref_logprobs(prompts, responses)
            rewards = self.compute_rewards(prompts, responses)

        for _ in range(4):
            new_logprobs = self.compute_policy_logprobs(prompts, responses)
            ratio = torch.exp(new_logprobs - old_logprobs)
            kl_penalty = new_logprobs - ref_logprobs
            adjusted_rewards = rewards - self.kl_coef * kl_penalty

            advantages = adjusted_rewards - adjusted_rewards.mean()
            advantages = advantages / (advantages.std() + 1e-8)

            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1 - self.clip_range, 1 + self.clip_range) * advantages
            policy_loss = -torch.min(surr1, surr2).mean()

            self.optimizer.zero_grad()
            policy_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1.0)
            self.optimizer.step()

        return {"loss": policy_loss.item(), "reward": rewards.mean().item()}

Direct Preference Optimization

DPO simplifies RLHF by directly optimizing preferences without a reward model.

PYTHON
class DPOTrainer:
    def __init__(self, policy_model, ref_model, tokenizer, beta=0.1):
        self.policy = policy_model
        self.ref = ref_model
        self.tokenizer = tokenizer
        self.beta = beta
        self.optimizer = torch.optim.AdamW(self.policy.parameters(), lr=1e-6)

    def compute_logprobs(self, model, input_ids, attention_mask, labels):
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits[:, :-1, :]
        labels = labels[:, 1:]

        mask = (labels != -100).float()
        # Clamp ignored positions to a valid index before gathering;
        # the mask zeroes out their contribution afterwards.
        safe_labels = labels.clamp(min=0)

        log_probs = F.log_softmax(logits, dim=-1)
        selected_log_probs = torch.gather(
            log_probs, 2, safe_labels.unsqueeze(-1)
        ).squeeze(-1)

        return (selected_log_probs * mask).sum(dim=1) / mask.sum(dim=1)

    def dpo_loss(self, chosen_batch, rejected_batch):
        with torch.no_grad():
            ref_chosen = self.compute_logprobs(
                self.ref, chosen_batch["input_ids"],
                chosen_batch["attention_mask"], chosen_batch["labels"]
            )
            ref_rejected = self.compute_logprobs(
                self.ref, rejected_batch["input_ids"],
                rejected_batch["attention_mask"], rejected_batch["labels"]
            )

        policy_chosen = self.compute_logprobs(
            self.policy, chosen_batch["input_ids"],
            chosen_batch["attention_mask"], chosen_batch["labels"]
        )
        policy_rejected = self.compute_logprobs(
            self.policy, rejected_batch["input_ids"],
            rejected_batch["attention_mask"], rejected_batch["labels"]
        )

        chosen_logratios = policy_chosen - ref_chosen
        rejected_logratios = policy_rejected - ref_rejected

        loss = -F.logsigmoid(self.beta * (chosen_logratios - rejected_logratios)).mean()
        return loss

    def train_step(self, chosen_batch, rejected_batch):
        self.optimizer.zero_grad()
        loss = self.dpo_loss(chosen_batch, rejected_batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1.0)
        self.optimizer.step()
        return loss.item()

Constitutional AI

Constitutional AI uses self-critique and revision guided by principles.

PYTHON
class ConstitutionalAI:
    def __init__(self, model, tokenizer, principles):
        self.model = model
        self.tokenizer = tokenizer
        self.principles = principles

    def generate_initial_response(self, prompt):
        return self.model.generate(f"Human: {prompt}\n\nAssistant:")

    def critique_response(self, prompt, response, principle):
        critique_prompt = f"""Consider the following response:

Response: {response}

Critique this response based on: {principle}

Critique:"""
        return self.model.generate(critique_prompt)

    def revise_response(self, prompt, response, critique):
        revision_prompt = f"""Original response: {response}

Critique: {critique}

Please provide an improved response that addresses the critique:

Revised response:"""
        return self.model.generate(revision_prompt)

    def constitutional_generation(self, prompt):
        response = self.generate_initial_response(prompt)

        for principle in self.principles:
            critique = self.critique_response(prompt, response, principle)
            response = self.revise_response(prompt, response, critique)

        return response


EXAMPLE_PRINCIPLES = [
    "Be helpful and provide accurate information",
    "Avoid harmful, unethical, or dangerous content",
    "Be honest about uncertainty and limitations",
    "Respect user privacy and confidentiality"
]

Key Takeaways

RLHF aligns LLMs with human values and preferences. Reward models learn to score responses based on human comparisons. PPO optimizes the policy while constraining divergence from the reference model. DPO simplifies alignment by directly optimizing preferences. Constitutional AI provides scalable self-improvement through principles. These techniques transform base models into helpful assistants.

16.4 Evaluation and Benchmarks

Evaluating LLMs is challenging because their capabilities span diverse tasks. Comprehensive evaluation requires multiple benchmarks, metrics, and testing methodologies.

Perplexity and Language Modeling Metrics

Perplexity measures how well a model predicts held-out text.

PYTHON
import torch
import torch.nn.functional as F
import math

class PerplexityCalculator:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def calculate_perplexity(self, texts, stride=512):
        self.model.eval()
        total_loss = 0
        total_tokens = 0

        for text in texts:
            encodings = self.tokenizer(text, return_tensors="pt")
            input_ids = encodings.input_ids
            seq_len = input_ids.size(1)

            nlls = []
            for i in range(0, seq_len, stride):
                begin = max(i + stride - 1024, 0)
                end = min(i + stride, seq_len)
                target_len = end - i

                input_slice = input_ids[:, begin:end]

                with torch.no_grad():
                    outputs = self.model(input_slice)
                    logits = outputs.logits

                shift_logits = logits[:, -target_len:-1, :]
                shift_labels = input_ids[:, i+1:end]

                loss = F.cross_entropy(
                    shift_logits.reshape(-1, shift_logits.size(-1)),
                    shift_labels.reshape(-1),
                    reduction="sum"
                )
                nlls.append(loss.item())
                total_tokens += target_len - 1

            total_loss += sum(nlls)

        perplexity = math.exp(total_loss / total_tokens)
        return perplexity


class BitsPerByte:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def calculate_bpb(self, text):
        num_bytes = len(text.encode("utf-8"))
        encodings = self.tokenizer(text, return_tensors="pt")

        with torch.no_grad():
            outputs = self.model(encodings.input_ids)
            logits = outputs.logits

        shift_logits = logits[:, :-1, :]
        shift_labels = encodings.input_ids[:, 1:]

        loss = F.cross_entropy(
            shift_logits.reshape(-1, shift_logits.size(-1)),
            shift_labels.reshape(-1)
        )

        nll_bits = loss.item() * shift_labels.numel() / math.log(2)
        bpb = nll_bits / num_bytes
        return bpb
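
Cross-entropy in nats, bits per token, and perplexity are all views of the same quantity. A worked example with a made-up average loss:

PYTHON
import math

avg_nll_nats = 3.2                               # hypothetical average cross-entropy per token
perplexity = math.exp(avg_nll_nats)              # ~24.5: like guessing among ~25 equally likely tokens
bits_per_token = avg_nll_nats / math.log(2)      # ~4.62 bits
print(f"perplexity={perplexity:.1f}, bits/token={bits_per_token:.2f}")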

Benchmark Suites

Standardized benchmarks enable model comparison across tasks.

PYTHON
class BenchmarkRunner:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def run_mmlu(self, dataset):
        correct = 0
        total = 0

        for item in dataset:
            question = item["question"]
            choices = item["choices"]
            answer = item["answer"]

            prompt = self.format_mmlu_prompt(question, choices)
            prediction = self.get_prediction(prompt, choices)

            if prediction == answer:
                correct += 1
            total += 1

        return correct / total

    def format_mmlu_prompt(self, question, choices):
        prompt = f"Question: {question}\n\n"
        for i, choice in enumerate(choices):
            label = chr(ord("A") + i)
            prompt += f"{label}. {choice}\n"
        prompt += "\nAnswer:"
        return prompt

    def get_prediction(self, prompt, choices):
        inputs = self.tokenizer(prompt, return_tensors="pt")

        with torch.no_grad():
            outputs = self.model(**inputs)
            logits = outputs.logits[0, -1]

        # Assumes each letter maps to a single leading token; tokenizers that
        # prepend special tokens would need those stripped first.
        choice_tokens = [self.tokenizer.encode(chr(ord("A") + i))[0]
                         for i in range(len(choices))]
        choice_logits = logits[choice_tokens]
        return choice_logits.argmax().item()


class HumanEvalRunner:
    def __init__(self, model, tokenizer, timeout=5):
        self.model = model
        self.tokenizer = tokenizer
        self.timeout = timeout

    def evaluate(self, problems, num_samples=1):
        results = []

        for problem in problems:
            prompt = problem["prompt"]
            test_cases = problem["test"]
            entry_point = problem["entry_point"]

            completions = self.generate_completions(prompt, num_samples)
            passed = any(
                self.run_tests(completion, test_cases, entry_point)
                for completion in completions
            )
            results.append(passed)

        pass_at_1 = sum(results) / len(results)
        return {"pass@1": pass_at_1}

    def generate_completions(self, prompt, num_samples):
        completions = []
        for _ in range(num_samples):
            inputs = self.tokenizer(prompt, return_tensors="pt")
            outputs = self.model.generate(**inputs, max_new_tokens=256)
            completion = self.tokenizer.decode(outputs[0])
            completions.append(completion)
        return completions

    def run_tests(self, code, test_cases, entry_point):
        # Simplified harness: the official HumanEval setup runs this in a
        # sandboxed subprocess with a timeout. The test string defines a
        # check() function that must be called on the completed function.
        try:
            exec_globals = {}
            exec(code, exec_globals)
            exec(test_cases, exec_globals)
            if "check" in exec_globals and entry_point in exec_globals:
                exec_globals["check"](exec_globals[entry_point])
            return True
        except Exception:
            return False

Generation Quality Metrics

Evaluating open-ended generation requires specialized metrics.

PYTHON
from collections import Counter
import math

class GenerationMetrics:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def bleu_score(self, reference, hypothesis, max_n=4):
        ref_tokens = reference.split()
        hyp_tokens = hypothesis.split()

        precisions = []
        for n in range(1, max_n + 1):
            ref_ngrams = Counter(self.get_ngrams(ref_tokens, n))
            hyp_ngrams = Counter(self.get_ngrams(hyp_tokens, n))

            overlap = sum((ref_ngrams & hyp_ngrams).values())
            total = sum(hyp_ngrams.values())

            if total > 0:
                precisions.append(overlap / total)
            else:
                precisions.append(0)

        if min(precisions) > 0:
            log_precision = sum(math.log(p) for p in precisions) / max_n
            bp = min(1, math.exp(1 - len(ref_tokens) / len(hyp_tokens)))
            return bp * math.exp(log_precision)
        return 0

    def get_ngrams(self, tokens, n):
        return [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]

    def rouge_l(self, reference, hypothesis):
        ref_tokens = reference.split()
        hyp_tokens = hypothesis.split()

        lcs_length = self.lcs(ref_tokens, hyp_tokens)

        precision = lcs_length / len(hyp_tokens) if hyp_tokens else 0
        recall = lcs_length / len(ref_tokens) if ref_tokens else 0

        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0

        return {"precision": precision, "recall": recall, "f1": f1}

    def lcs(self, seq1, seq2):
        m, n = len(seq1), len(seq2)
        dp = [[0] * (n + 1) for _ in range(m + 1)]

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if seq1[i-1] == seq2[j-1]:
                    dp[i][j] = dp[i-1][j-1] + 1
                else:
                    dp[i][j] = max(dp[i-1][j], dp[i][j-1])

        return dp[m][n]


class DiversityMetrics:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def distinct_n(self, texts, n=2):
        all_ngrams = []
        total_tokens = 0

        for text in texts:
            tokens = text.split()
            total_tokens += len(tokens)
            ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
            all_ngrams.extend(ngrams)

        unique_ngrams = len(set(all_ngrams))
        return unique_ngrams / len(all_ngrams) if all_ngrams else 0
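
A quick usage sketch for the metrics above. Tokenization is whitespace-based, so the scores are only indicative:

PYTHON
metrics = GenerationMetrics(tokenizer=None)  # tokenizer is unused by these methods
reference = "the quick brown fox jumps over the lazy dog"
hypothesis = "the quick brown fox leaps over the lazy dog"

print(metrics.bleu_score(reference, hypothesis))
print(metrics.rouge_l(reference, hypothesis))   # precision = recall = f1 = 8/9

diversity = DiversityMetrics(tokenizer=None)
print(diversity.distinct_n([reference, hypothesis], n=2))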

LLM-as-Judge Evaluation

Using LLMs to evaluate other LLM outputs.

PYTHON
class LLMJudge:
    def __init__(self, judge_model, tokenizer):
        self.judge = judge_model
        self.tokenizer = tokenizer

    def pairwise_comparison(self, prompt, response_a, response_b):
        judge_prompt = f"""Compare these two responses to the prompt.

Prompt: {prompt}

Response A: {response_a}

Response B: {response_b}

Which response is better? Answer with just "A" or "B".
Winner:"""

        inputs = self.tokenizer(judge_prompt, return_tensors="pt")
        outputs = self.judge.generate(**inputs, max_new_tokens=1)
        result = self.tokenizer.decode(outputs[0][-1])

        return "A" if "A" in result else "B"

    def score_response(self, prompt, response, criteria):
        judge_prompt = f"""Rate the following response on a scale of 1-5.

Prompt: {prompt}

Response: {response}

Criteria: {criteria}

Score (1-5):"""

        inputs = self.tokenizer(judge_prompt, return_tensors="pt")
        outputs = self.judge.generate(**inputs, max_new_tokens=1)
        result = self.tokenizer.decode(outputs[0][-1])

        try:
            return int(result.strip())
        except ValueError:
            return 3

Key Takeaways

LLM evaluation requires multiple complementary approaches. Perplexity measures language modeling but not task performance. Benchmark suites like MMLU and HumanEval test specific capabilities. Generation metrics like BLEU and ROUGE compare outputs to references. LLM-as-judge approaches scale human evaluation. Comprehensive evaluation combines automated metrics with human judgment.

16.5 Efficient Architectures and Optimization

Making LLMs faster and more memory-efficient is crucial for practical deployment. Various techniques reduce computational requirements while maintaining model quality.

Flash Attention

Flash Attention computes attention with reduced memory by avoiding materialization of the full attention matrix.

PYTHON
import torch
import torch.nn as nn
import math

class FlashAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, block_size=256):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.block_size = block_size
        self.scale = self.head_dim ** -0.5

    def forward(self, q, k, v, causal=True):
        batch, seq_len, _ = q.shape
        q = q.view(batch, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        output = torch.zeros_like(q)
        row_max = torch.full((batch, self.num_heads, seq_len), float("-inf"), device=q.device)
        row_sum = torch.zeros(batch, self.num_heads, seq_len, device=q.device)

        num_blocks = (seq_len + self.block_size - 1) // self.block_size

        for i in range(num_blocks):
            q_start = i * self.block_size
            q_end = min((i + 1) * self.block_size, seq_len)
            q_block = q[:, :, q_start:q_end, :]

            for j in range(num_blocks):
                if causal and j > i:
                    continue

                k_start = j * self.block_size
                k_end = min((j + 1) * self.block_size, seq_len)

                k_block = k[:, :, k_start:k_end, :]
                v_block = v[:, :, k_start:k_end, :]

                scores = torch.matmul(q_block, k_block.transpose(-2, -1)) * self.scale

                if causal and i == j:
                    mask = torch.triu(torch.ones(q_end - q_start, k_end - k_start, device=q.device), diagonal=1)
                    scores = scores.masked_fill(mask.bool(), float("-inf"))

                block_max = scores.max(dim=-1, keepdim=True)[0]
                exp_scores = torch.exp(scores - block_max)
                block_sum = exp_scores.sum(dim=-1, keepdim=True)

                new_max = torch.maximum(row_max[:, :, q_start:q_end].unsqueeze(-1), block_max)
                scale_old = torch.exp(row_max[:, :, q_start:q_end].unsqueeze(-1) - new_max)
                scale_new = torch.exp(block_max - new_max)

                output[:, :, q_start:q_end, :] = (
                    scale_old * output[:, :, q_start:q_end, :] +
                    scale_new * torch.matmul(exp_scores, v_block)
                )

                row_sum[:, :, q_start:q_end] = (
                    scale_old.squeeze(-1) * row_sum[:, :, q_start:q_end] +
                    scale_new.squeeze(-1) * block_sum.squeeze(-1)
                )
                row_max[:, :, q_start:q_end] = new_max.squeeze(-1)

        output = output / row_sum.unsqueeze(-1)
        return output.transpose(1, 2).reshape(batch, seq_len, self.embed_dim)
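
The blockwise computation should agree with ordinary softmax attention up to floating-point error; the sketch below checks exactly that. Note that this pure-PyTorch version demonstrates the algorithm but not the speedup: the memory and speed gains come from fused kernels, such as those behind torch.nn.functional.scaled_dot_product_attention in PyTorch 2.x.

PYTHON
# Compare the blockwise result against a naive reference that materializes
# the full causal attention matrix.
torch.manual_seed(0)
batch, seq_len, embed_dim, num_heads = 2, 512, 64, 4
q = torch.randn(batch, seq_len, embed_dim)
k = torch.randn(batch, seq_len, embed_dim)
v = torch.randn(batch, seq_len, embed_dim)

flash = FlashAttention(embed_dim, num_heads, block_size=128)
blockwise = flash(q, k, v, causal=True)

head_dim = embed_dim // num_heads
qh = q.view(batch, seq_len, num_heads, head_dim).transpose(1, 2)
kh = k.view(batch, seq_len, num_heads, head_dim).transpose(1, 2)
vh = v.view(batch, seq_len, num_heads, head_dim).transpose(1, 2)
scores = qh @ kh.transpose(-2, -1) / head_dim ** 0.5
causal_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
scores = scores.masked_fill(causal_mask, float("-inf"))
reference = (torch.softmax(scores, dim=-1) @ vh).transpose(1, 2).reshape(batch, seq_len, embed_dim)

print(torch.allclose(blockwise, reference, atol=1e-4))  # expect True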

Grouped-Query Attention

GQA reduces memory by sharing key-value heads across query heads.

PYTHON
class GroupedQueryAttention(nn.Module):
    def __init__(self, embed_dim, num_q_heads, num_kv_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_q_heads = num_q_heads
        self.num_kv_heads = num_kv_heads
        self.head_dim = embed_dim // num_q_heads
        self.groups = num_q_heads // num_kv_heads

        self.q_proj = nn.Linear(embed_dim, num_q_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(embed_dim, num_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, num_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(num_q_heads * self.head_dim, embed_dim, bias=False)

    def forward(self, x, mask=None):
        batch, seq_len, _ = x.shape

        q = self.q_proj(x).view(batch, seq_len, self.num_q_heads, self.head_dim)
        k = self.k_proj(x).view(batch, seq_len, self.num_kv_heads, self.head_dim)
        v = self.v_proj(x).view(batch, seq_len, self.num_kv_heads, self.head_dim)

        k = k.repeat_interleave(self.groups, dim=2)
        v = v.repeat_interleave(self.groups, dim=2)

        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is not None:
            scores = scores.masked_fill(mask, float("-inf"))

        attn = torch.softmax(scores, dim=-1)
        out = torch.matmul(attn, v)

        out = out.transpose(1, 2).reshape(batch, seq_len, -1)
        return self.o_proj(out)
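
The saving is easy to quantify: the KV cache scales with the number of key-value heads rather than query heads. The back-of-the-envelope comparison below uses hypothetical 7B-class dimensions in fp16; kv_cache_bytes is just arithmetic, not a library function.

PYTHON
def kv_cache_bytes(num_layers, num_kv_heads, head_dim, seq_len, batch, bytes_per_elem=2):
    # Factor of 2 covers keys and values.
    return 2 * num_layers * num_kv_heads * head_dim * seq_len * batch * bytes_per_elem

# Hypothetical model: 32 layers, 32 query heads, head_dim 128, 4096-token context, batch 8.
mha = kv_cache_bytes(num_layers=32, num_kv_heads=32, head_dim=128, seq_len=4096, batch=8)
gqa = kv_cache_bytes(num_layers=32, num_kv_heads=8, head_dim=128, seq_len=4096, batch=8)
print(f"MHA KV cache: {mha / 1e9:.1f} GB")   # ~17.2 GB
print(f"GQA KV cache: {gqa / 1e9:.1f} GB")   # ~4.3 GB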

Knowledge Distillation

Training smaller student models to mimic larger teacher models.

PYTHON
class DistillationTrainer:
    def __init__(self, teacher, student, tokenizer, temperature=2.0, alpha=0.5):
        self.teacher = teacher
        self.student = student
        self.tokenizer = tokenizer
        self.temperature = temperature
        self.alpha = alpha
        self.optimizer = torch.optim.AdamW(student.parameters(), lr=1e-4)

    def distillation_loss(self, student_logits, teacher_logits, labels):
        soft_targets = torch.softmax(teacher_logits / self.temperature, dim=-1)
        soft_predictions = torch.log_softmax(student_logits / self.temperature, dim=-1)

        distill_loss = torch.nn.functional.kl_div(
            soft_predictions,
            soft_targets,
            reduction="batchmean"
        ) * (self.temperature ** 2)

        hard_loss = torch.nn.functional.cross_entropy(
            student_logits.view(-1, student_logits.size(-1)),
            labels.view(-1),
            ignore_index=-100
        )

        return self.alpha * distill_loss + (1 - self.alpha) * hard_loss

    def train_step(self, batch):
        self.teacher.eval()
        self.student.train()

        input_ids = batch["input_ids"]
        labels = batch["labels"]

        with torch.no_grad():
            teacher_outputs = self.teacher(input_ids)
            teacher_logits = teacher_outputs.logits

        student_outputs = self.student(input_ids)
        student_logits = student_outputs.logits

        loss = self.distillation_loss(student_logits, teacher_logits, labels)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

Pruning

Removing unnecessary weights to create sparser, faster models.

PYTHON
class MagnitudePruner:
    def __init__(self, model, sparsity=0.5):
        self.model = model
        self.sparsity = sparsity

    def compute_threshold(self, tensor):
        flat = tensor.abs().flatten()
        k = int(len(flat) * self.sparsity)
        threshold = flat.kthvalue(k).values
        return threshold

    def prune_linear_layers(self):
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear):
                threshold = self.compute_threshold(module.weight)
                mask = module.weight.abs() > threshold
                module.weight.data *= mask.float()

    def structured_prune_heads(self, num_heads_to_remove):
        for name, module in self.model.named_modules():
            if hasattr(module, "num_heads"):
                head_importance = self.compute_head_importance(module)
                heads_to_prune = head_importance.argsort()[:num_heads_to_remove]
                self.remove_heads(module, heads_to_prune)

    def compute_head_importance(self, attention_module):
        # Placeholder: real implementations score heads via attention entropy,
        # gradient-based sensitivity, or activation statistics.
        return torch.randn(attention_module.num_heads)

    def remove_heads(self, module, heads):
        # Placeholder: actual removal means slicing the q/k/v and output
        # projection weights for the selected heads.
        pass


class SparseLinear(nn.Module):
    def __init__(self, in_features, out_features, sparsity=0.9):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        num_nonzero = int((1 - sparsity) * in_features * out_features)
        self.indices = nn.Parameter(torch.randint(0, in_features * out_features, (num_nonzero,)), requires_grad=False)
        self.values = nn.Parameter(torch.randn(num_nonzero) * 0.02)

    def forward(self, x):
        weight = torch.zeros(self.out_features, self.in_features, device=x.device)
        weight.view(-1)[self.indices] = self.values
        return torch.nn.functional.linear(x, weight)

Efficient Inference Serving

Optimizations for production deployment.

PYTHON
class EfficientServer:
    def __init__(self, model, tokenizer, max_batch_size=32):
        self.model = model
        self.tokenizer = tokenizer
        self.max_batch_size = max_batch_size
        self.request_queue = []

    def add_request(self, prompt, max_tokens):
        self.request_queue.append({
            "prompt": prompt,
            "max_tokens": max_tokens,
            "tokens": self.tokenizer.encode(prompt)
        })

    def process_batch(self):
        if not self.request_queue:
            return []

        batch = self.request_queue[:self.max_batch_size]
        self.request_queue = self.request_queue[self.max_batch_size:]

        max_len = max(len(r["tokens"]) for r in batch)
        input_ids = torch.zeros(len(batch), max_len, dtype=torch.long)

        for i, request in enumerate(batch):
            tokens = request["tokens"]
            input_ids[i, :len(tokens)] = torch.tensor(tokens)

        with torch.no_grad():
            # Simplified: pads with token id 0, omits the attention mask, and
            # applies the first request's max_tokens to the whole batch; a real
            # server would handle per-request limits and masking.
            output = self.model.generate(input_ids, max_new_tokens=batch[0]["max_tokens"])

        results = []
        for i, request in enumerate(batch):
            generated = self.tokenizer.decode(output[i])
            results.append(generated)

        return results

Key Takeaways

Efficient architectures make LLMs practical for deployment. Flash Attention reduces memory by computing attention blockwise. Grouped-Query Attention shares key-value heads to reduce cache size. Knowledge distillation transfers capabilities to smaller models. Pruning removes unnecessary weights for faster inference. These optimizations enable serving LLMs at scale with limited resources.