Introduction to Large Language Models
Large Language Models (LLMs) represent a paradigm shift in natural language processing. These models, with billions of parameters trained on massive text corpora, exhibit remarkable capabilities in understanding and generating human language.
What Makes a Model Large
Scale defines LLMs across three dimensions: parameters, training data, and compute.
import torch
import torch.nn as nn

def count_parameters(model):
    """Report parameter counts and rough memory footprints for a model."""
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total:,}")
    print(f"Trainable parameters: {trainable:,}")
    print(f"Model size (FP32): {total * 4 / 1e9:.2f} GB")  # 4 bytes per parameter
    print(f"Model size (FP16): {total * 2 / 1e9:.2f} GB")  # 2 bytes per parameter
    return total

def estimate_parameters(n_layers, embed_dim, vocab_size=50000, ff_mult=4):
    """Back-of-the-envelope parameter count for a decoder-only transformer."""
    embedding_params = vocab_size * embed_dim
    attention_params = 4 * embed_dim * embed_dim       # Q, K, V, and output projections
    ff_params = 2 * embed_dim * (ff_mult * embed_dim)  # up and down projections
    layer_norm_params = 4 * embed_dim                  # two norms per layer (weight and bias)
    layer_params = attention_params + ff_params + layer_norm_params
    total = embedding_params + (n_layers * layer_params) + embed_dim  # plus final norm
    return total

params_7b = estimate_parameters(n_layers=32, embed_dim=4096)
print(f"Estimated 7B model parameters: {params_7b / 1e9:.1f}B")

Modern LLMs commonly range from around 7 billion to well over 100 billion parameters. The compute required to train these models follows predictable scaling laws.
The Scaling Laws
Research has revealed predictable relationships between scale and performance.
import numpy as np

class ScalingLaws:
    def __init__(self, E=1.69, A=406.4, B=410.7, alpha=0.34, beta=0.28):
        # Constants fit in the Chinchilla analysis (Hoffmann et al., 2022).
        self.E = E
        self.A = A
        self.B = B
        self.alpha = alpha
        self.beta = beta

    def predict_loss(self, N, D):
        """Predicted loss for a model with N parameters trained on D tokens."""
        return self.E + self.A / (N ** self.alpha) + self.B / (D ** self.beta)

    def optimal_tokens(self, N):
        return 20 * N  # Chinchilla ratio: ~20 tokens per parameter

    def compute_optimal_allocation(self, compute_budget):
        # With C ≈ 6*N*D and D = 20*N, the budget becomes C ≈ 120*N^2.
        N = np.sqrt(compute_budget / 120)
        D = self.optimal_tokens(N)
        return int(N), int(D)

scaling = ScalingLaws()
for N in [7e9, 13e9, 70e9]:
    optimal_D = scaling.optimal_tokens(N)
    loss = scaling.predict_loss(N, optimal_D)
    print(f"{N/1e9:.0f}B model: {optimal_D/1e12:.1f}T tokens, Loss: {loss:.3f}")

The Chinchilla scaling laws suggest training on approximately 20 tokens per parameter for compute-optimal training.
Architecture Overview
Modern LLMs use decoder-only transformer architectures with specific modifications.
class RMSNorm(nn.Module):
    """Root-mean-square norm: rescales by the RMS of the activations, with no mean-centering."""
    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        rms = torch.sqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + self.eps)
        return x / rms * self.weight

class LLMBlock(nn.Module):
    """Pre-norm decoder block: multi-head self-attention followed by a SwiGLU feed-forward."""
    def __init__(self, embed_dim, num_heads, ff_mult=4):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.norm1 = RMSNorm(embed_dim)
        self.norm2 = RMSNorm(embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.o_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        # SwiGLU hidden size is scaled by 2/3 so total feed-forward parameters stay comparable.
        ff_dim = int(ff_mult * embed_dim * 2 / 3)
        self.gate_proj = nn.Linear(embed_dim, ff_dim, bias=False)
        self.up_proj = nn.Linear(embed_dim, ff_dim, bias=False)
        self.down_proj = nn.Linear(ff_dim, embed_dim, bias=False)

    def forward(self, x, mask=None):
        # Self-attention sub-layer (pre-norm, residual connection).
        h = self.norm1(x)
        batch, seq_len, _ = h.shape
        q = self.q_proj(h).view(batch, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(h).view(batch, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(h).view(batch, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        attn = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        if mask is not None:
            attn = attn.masked_fill(mask, float("-inf"))  # mask is True at disallowed positions
        attn = torch.softmax(attn, dim=-1)
        out = torch.matmul(attn, v).transpose(1, 2).reshape(batch, seq_len, self.embed_dim)
        x = x + self.o_proj(out)
        # SwiGLU feed-forward sub-layer (pre-norm, residual connection).
        h = self.norm2(x)
        x = x + self.down_proj(nn.functional.silu(self.gate_proj(h)) * self.up_proj(h))
        return x

Key innovations include RMSNorm for efficiency, SwiGLU activation for better gradients, and rotary position embeddings (RoPE) for length generalization; RoPE is not shown in the block above, but a sketch follows.
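The sketch below is a minimal rotate-half formulation of rotary position embeddings, which would typically be applied to the query and key tensors before the attention scores are computed. The function name rotary_embedding is illustrative, and the base of 10000 follows the original RoPE formulation; this is a sketch of the idea rather than any particular library's implementation.

def rotary_embedding(x, base=10000.0):
    """Apply rotary position embeddings to a (batch, heads, seq, head_dim) tensor.

    Each channel pair is rotated by an angle proportional to its position, so
    relative offsets are encoded directly in the q-k dot products.
    """
    batch, heads, seq_len, head_dim = x.shape
    half = head_dim // 2
    # Per-pair rotation frequencies: base^(-i/half) for i = 0 .. half-1.
    freqs = base ** (-torch.arange(0, half, dtype=torch.float32) / half)
    angles = torch.arange(seq_len, dtype=torch.float32)[:, None] * freqs[None, :]  # (seq, half)
    cos, sin = torch.cos(angles), torch.sin(angles)
    x1, x2 = x[..., :half], x[..., half:]
    rotated = torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
    return rotated.type_as(x)

# Inside LLMBlock.forward, this would be applied before the attention matmul:
#   q, k = rotary_embedding(q), rotary_embedding(k)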
Training Data Composition
LLMs train on diverse internet-scale text corpora with careful curation.
TRAINING_DATA_MIX = {
    "web_crawl": 0.60,      # Common Crawl, filtered
    "code": 0.15,           # GitHub repositories
    "books": 0.08,          # Book corpora
    "conversations": 0.08,  # Forums, Q&A sites
    "scientific": 0.05,     # arXiv papers
    "wikipedia": 0.04,      # Wikipedia dumps
}

def calculate_tokens_needed(params_billions, chinchilla_ratio=20):
    """Token budget for compute-optimal training at ~20 tokens per parameter."""
    tokens = params_billions * chinchilla_ratio * 1e9
    print(f"{params_billions}B model needs {tokens/1e12:.1f}T tokens")
    return tokens

for size in [7, 13, 70]:
    calculate_tokens_needed(size)

Data quality significantly impacts model capabilities. Filtering, deduplication, and careful mixing of data sources are essential preprocessing steps; a sketch of how the mix above translates into per-source token budgets follows.
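As a minimal sketch of the mixing step, the weights in TRAINING_DATA_MIX can be converted into per-source token budgets for a given model size. The tokens_per_source helper below is illustrative and assumes the same Chinchilla-style 20-tokens-per-parameter budget used earlier; real pipelines also apply quality filtering and deduplication before sampling from each source.

def tokens_per_source(params_billions, mix=TRAINING_DATA_MIX, chinchilla_ratio=20):
    """Split a compute-optimal token budget across data sources by mixture weight."""
    total_tokens = params_billions * chinchilla_ratio * 1e9
    return {source: weight * total_tokens for source, weight in mix.items()}

for source, tokens in tokens_per_source(70).items():
    print(f"{source:>14}: {tokens / 1e9:,.0f}B tokens")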
Key Takeaways
Large Language Models achieve their capabilities through massive scale in parameters, data, and compute. Scaling laws predict performance improvements and guide the allocation of a compute budget between model size and training tokens. Modern architectures build on decoder-only transformers with RMSNorm, SwiGLU activations, and rotary position embeddings. Training data diversity and quality significantly impact model capabilities.