Python Generators and Iterators
Generators are a powerful way to create lazy iterators — sequences that produce values one at a time, on demand, rather than storing everything in memory at once.
Iterators Recap
An iterator is any object that implements __iter__() and __next__():
# Lists are iterables (not iterators): they have __iter__ but no __next__
numbers = [1, 2, 3]
# Get an iterator from an iterable
it = iter(numbers)
# Each next() call advances the iterator by exactly one element
print(next(it)) # 1
print(next(it)) # 2
print(next(it)) # 3
# print(next(it)) # StopIteration error
Creating a Custom Iterator
class Countdown:
    """Iterator that yields n, n-1, ..., 1 and then stops.

    The instance is its own iterator, so a Countdown object can be
    consumed only once.
    """

    def __init__(self, n):
        self.n = n

    def __iter__(self):
        # Iterators return themselves from __iter__.
        return self

    def __next__(self):
        current = self.n
        if current <= 0:
            # Exhausted: signal the end of iteration to for-loops.
            raise StopIteration
        self.n = current - 1
        return current

for num in Countdown(5):
    print(num, end=" ")
# 5 4 3 2 1
Generator Functions
A generator function uses yield instead of return:
def countdown(n):
    """Lazily yield the integers n, n-1, ..., 1 (nothing for n <= 0)."""
    for value in range(n, 0, -1):
        yield value

# Call returns a generator object (doesn't execute yet)
gen = countdown(5)
print(type(gen)) # <class 'generator'>
# Values produced on demand
print(next(gen)) # 5
print(next(gen)) # 4
# Use in loops
for num in countdown(3):
    print(num, end=" ")
# 3 2 1
How yield Works
def simple_gen():
    """Yield 1, 2, 3, printing a message before each yield.

    The prints make it visible that the body pauses at every yield and
    resumes where it left off on the next call to next().
    """
    for step in (1, 2, 3):
        print(f"Step {step}")
        yield step
    print("Done")

gen = simple_gen()
print(next(gen))
# Step 1
# 1
print(next(gen))
# Step 2
# 2
print(next(gen))
# Step 3
# 3
# next(gen) would print "Done" then raise StopIteration
Each yield pauses the function, saving its state. The next call to next() resumes from where it left off.
Generator Expressions
Compact syntax for simple generators (like list comprehensions, but lazy):
# List comprehension (creates entire list in memory)
squares_list = [x**2 for x in range(1000000)]
# Generator expression (creates values on demand)
squares_gen = (x**2 for x in range(1000000))
# Generator uses almost no memory!
import sys
# NOTE: getsizeof is shallow — for the list it measures the pointer array
print(sys.getsizeof(squares_list)) # ~8 MB
print(sys.getsizeof(squares_gen)) # ~200 bytes
# Use in functions that accept iterables
# (no extra parentheses needed when the genexp is the sole argument)
total = sum(x**2 for x in range(100))
print(total) # 328350
max_val = max(x**2 for x in range(100))
print(max_val) # 9801
Why Use Generators?
1. Memory Efficiency
# BAD: loads entire file into memory
def read_all_lines(filename):
    """Anti-example: readlines() materializes every line of the file at once."""
    with open(filename) as f:
        return f.readlines() # Could use GBs of memory
# GOOD: yields one line at a time
def read_lines(filename):
    """Lazily yield each line of *filename*, stripped of surrounding whitespace."""
    with open(filename) as handle:
        yield from (raw.strip() for raw in handle)
# Process a huge file with minimal memory:
# only one line is ever held in memory at a time
for line in read_lines("huge_file.txt"):
    if "ERROR" in line:
        print(line)
2. Infinite Sequences
def fibonacci():
    """Endlessly yield the Fibonacci numbers 0, 1, 1, 2, 3, 5, ..."""
    previous, current = 0, 1
    while True:
        yield previous
        # Tuple assignment advances both values in one atomic step.
        previous, current = current, previous + current
# Take only what you need — islice caps an otherwise infinite generator
from itertools import islice
first_10 = list(islice(fibonacci(), 10))
print(first_10)
# [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]
# Find first Fibonacci number over 1000
for fib in fibonacci():
    if fib > 1000:
        print(fib) # 1597
        break
3. Pipeline Processing
def read_lines(filename):
    """First pipeline stage: lazily yield stripped lines from *filename*."""
    with open(filename) as source:
        yield from map(str.strip, source)
def filter_comments(lines):
    """Drop lines that start with '#'; pass every other line through unchanged."""
    for candidate in lines:
        if candidate.startswith("#"):
            continue
        yield candidate
def parse_csv(lines):
    """Split each comma-separated line into a list of raw (unstripped) fields."""
    yield from (record.split(",") for record in lines)
def extract_names(rows):
    """Yield the stripped second field of every row that has at least two fields."""
    for fields in rows:
        if len(fields) < 2:
            # Malformed row: silently skip, matching the pipeline's tolerance.
            continue
        yield fields[1].strip()
# Chain generators into a pipeline
# Each step processes ONE item at a time
# (building the pipeline is lazy — nothing runs until it is iterated)
pipeline = extract_names(
    parse_csv(
        filter_comments(
            read_lines("data.csv")
        )
    )
)
for name in pipeline:
    print(name)
yield from
Delegate to a sub-generator:
def flatten(nested_list):
    """Depth-first yield of every non-list leaf in an arbitrarily nested list."""
    for element in nested_list:
        if not isinstance(element, list):
            yield element
        else:
            # Delegate iteration to the recursive sub-generator.
            yield from flatten(element)

data = [1, [2, 3], [4, [5, 6]], 7]
print(list(flatten(data)))
# [1, 2, 3, 4, 5, 6, 7]
# Without yield from (equivalent but verbose)
def flatten_verbose(nested_list):
    """Same contract as flatten(), spelled with an explicit inner loop."""
    for item in nested_list:
        if isinstance(item, list):
            # Manually re-yield everything the recursive call produces.
            for sub_item in flatten_verbose(item):
                yield sub_item
        else:
            yield item
The itertools Module
Python's itertools module provides powerful iterator building blocks:
import itertools

# count: infinite counter (start=10, step=2)
for i in itertools.count(10, 2):
    if i > 20:
        break
    print(i, end=" ") # 10 12 14 16 18 20

# cycle: repeat an iterable forever
colors = itertools.cycle(["red", "green", "blue"])
# zip against a finite range to take just six values from the infinite cycle
for _, color in zip(range(6), colors):
    print(color, end=" ")
# red green blue red green blue

# chain: concatenate iterables
combined = itertools.chain([1, 2], [3, 4], [5, 6])
print(list(combined)) # [1, 2, 3, 4, 5, 6]

# islice: slice an iterator (list-style slicing doesn't work on iterators)
print(list(itertools.islice(range(100), 5, 10)))
# [5, 6, 7, 8, 9]

# groupby: group CONSECUTIVE elements — note the second 'A' run groups separately
data = [("A", 1), ("A", 2), ("B", 3), ("B", 4), ("A", 5)]
for key, group in itertools.groupby(data, key=lambda x: x[0]):
    print(f"{key}: {list(group)}")
# A: [('A', 1), ('A', 2)]
# B: [('B', 3), ('B', 4)]
# A: [('A', 5)]

# product: cartesian product
for combo in itertools.product("AB", "12"):
    print(combo, end=" ")
# ('A', '1') ('A', '2') ('B', '1') ('B', '2')

# permutations and combinations
print(list(itertools.permutations("ABC", 2)))
# [('A','B'), ('A','C'), ('B','A'), ('B','C'), ('C','A'), ('C','B')]
print(list(itertools.combinations("ABC", 2)))
# [('A', 'B'), ('A', 'C'), ('B', 'C')]
send() and Generator Communication
Generators can receive values via send():
def running_average():
    """Generator-coroutine that yields the running mean of values sent in.

    Prime with next() (yields None) before the first send(). Sending None
    leaves the average unchanged.
    """
    total = 0
    count = 0
    average = None
    while True:
        # yield hands the current average out and receives the next value.
        received = yield average
        if received is None:
            continue
        total += received
        count += 1
        average = total / count
avg = running_average()
next(avg) # Initialize (advance to first yield; this first yield is None)
print(avg.send(10)) # 10.0
print(avg.send(20)) # 15.0
print(avg.send(30)) # 20.0
print(avg.send(40)) # 25.0
Practical Example: Data Stream Processor
"""
Process a stream of data with generators.
"""
import itertools
from collections import deque
def generate_data():
    """Simulate an endless stream of sensor readings (timestamp/value/sensor)."""
    import random
    while True:
        reading = {
            "timestamp": random.randint(1000, 9999),
            "value": random.uniform(0, 100),
            "sensor": random.choice(["A", "B", "C"]),
        }
        yield reading
def filter_sensor(data_stream, sensor_id):
    """Yield only the readings whose "sensor" field equals *sensor_id*."""
    yield from (reading for reading in data_stream if reading["sensor"] == sensor_id)
def moving_average(data_stream, window_size=5):
    """Tag each reading with the mean of the last *window_size* values.

    Readings seen before the window first fills are consumed but not
    yielded, matching the original warm-up behavior.
    """
    window = deque(maxlen=window_size)
    for reading in data_stream:
        window.append(reading["value"])
        if len(window) < window_size:
            continue
        yield {**reading, "avg": round(sum(window) / window_size, 2)}
def detect_anomalies(data_stream, threshold=70):
    """Yield readings whose windowed average exceeds *threshold*, flagged.

    Expects each reading to already carry an "avg" key (i.e. to come
    from moving_average()).
    """
    for reading in data_stream:
        if reading["avg"] <= threshold:
            continue
        yield {**reading, "anomaly": True}
# Build processing pipeline (lazy — nothing runs until iterated below)
raw = generate_data()
sensor_a = filter_sensor(raw, "A")
averaged = moving_average(sensor_a, window_size=3)
anomalies = detect_anomalies(averaged, threshold=60)
# Process first 5 anomalies, then stop pulling from the infinite stream
for point in itertools.islice(anomalies, 5):
    print(f"Anomaly! Sensor {point['sensor']} "
          f"avg={point['avg']} value={point['value']:.1f}")
Summary
- Iterators implement __iter__() and __next__() for sequential access
- Generator functions use yield to produce values lazily, one at a time
- Generator expressions: (expr for x in iterable) — lazy version of list comprehensions
- Generators are memory-efficient for large datasets and infinite sequences
- Use yield from to delegate to sub-generators
- itertools provides powerful tools: chain, islice, groupby, product, etc.
- Generators support pipeline processing — chaining transformations efficiently
- Use send() to communicate values back into a generator
Next, we'll learn about regular expressions in Python.