Advanced Topics

Python Generators and Iterators

Learn how to create memory-efficient sequences with Python generators and iterators, including yield, generator expressions, and itertools.

Python Generators and Iterators

Generators are a powerful way to create lazy iterators — sequences that produce values one at a time, on demand, rather than storing everything in memory at once.


Iterators Recap

An iterator is any object that implements __iter__() and __next__():

python
# Lists are iterables (not iterators)
numbers = [1, 2, 3]

# Get an iterator from an iterable
it = iter(numbers)

print(next(it))  # 1
print(next(it))  # 2
print(next(it))  # 3
# print(next(it))  # StopIteration error

Creating a Custom Iterator

python
class Countdown:
    """Iterator producing n, n-1, ..., 1 and then stopping."""

    def __init__(self, n):
        # Remaining value to emit; decremented on every __next__ call.
        self.n = n

    def __iter__(self):
        # An iterator is its own iterable, so iteration returns self.
        return self

    def __next__(self):
        if self.n > 0:
            current = self.n
            self.n = current - 1
            return current
        # Exhausted: signal the for-loop (or next()) to stop.
        raise StopIteration


for num in Countdown(5):
    print(num, end=" ")
# 5 4 3 2 1

Generator Functions

A generator function uses yield instead of return:

python
def countdown(n):
    """Lazily yield n, n-1, ..., 1 (yields nothing when n <= 0)."""
    remaining = n
    while remaining > 0:
        yield remaining
        remaining -= 1

# Call returns a generator object (doesn't execute yet)
gen = countdown(5)
print(type(gen))  # <class 'generator'>

# Values produced on demand
print(next(gen))  # 5
print(next(gen))  # 4

# Use in loops
for num in countdown(3):
    print(num, end=" ")
# 3 2 1

How yield Works

python
def simple_gen():
    """Yield 1, 2, 3, printing a step message before each value and "Done" after the last."""
    steps = (("Step 1", 1), ("Step 2", 2), ("Step 3", 3))
    for message, value in steps:
        print(message)
        yield value
    print("Done")

gen = simple_gen()

print(next(gen))
# Step 1
# 1

print(next(gen))
# Step 2
# 2

print(next(gen))
# Step 3
# 3

# next(gen) would print "Done" then raise StopIteration

Each yield pauses the function, saving its state. The next call to next() resumes from where it left off.


Generator Expressions

Compact syntax for simple generators (like list comprehensions, but lazy):

python
# List comprehension (creates entire list in memory)
squares_list = [x**2 for x in range(1000000)]

# Generator expression (creates values on demand)
squares_gen = (x**2 for x in range(1000000))

# Generator uses almost no memory!
import sys
print(sys.getsizeof(squares_list))  # ~8 MB
print(sys.getsizeof(squares_gen))   # ~200 bytes

# Use in functions that accept iterables
total = sum(x**2 for x in range(100))
print(total)  # 328350

max_val = max(x**2 for x in range(100))
print(max_val)  # 9801

Why Use Generators?

1. Memory Efficiency

python
# BAD: loads entire file into memory
def read_all_lines(filename):
    """Eagerly read *filename* and return every line (newlines kept) as a list."""
    with open(filename) as handle:
        # readlines() materializes the whole file at once -- the memory-hungry path
        # this tutorial section warns against.
        return handle.readlines()

# GOOD: yields one line at a time
def read_lines(filename):
    """Lazily yield each line of *filename* with surrounding whitespace stripped."""
    with open(filename) as handle:
        # Iterating the file object reads one line at a time.
        yield from (raw.strip() for raw in handle)

# Process a huge file with minimal memory
for line in read_lines("huge_file.txt"):
    if "ERROR" in line:
        print(line)

2. Infinite Sequences

python
def fibonacci():
    """Yield the Fibonacci sequence 0, 1, 1, 2, 3, ... without end."""
    current, nxt = 0, 1
    while True:
        yield current
        current, nxt = nxt, current + nxt

# Take only what you need
from itertools import islice

first_10 = list(islice(fibonacci(), 10))
print(first_10)
# [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]

# Find first Fibonacci number over 1000
for fib in fibonacci():
    if fib > 1000:
        print(fib)  # 1597
        break

3. Pipeline Processing

python
def read_lines(filename):
    """First pipeline stage: lazily yield whitespace-stripped lines from *filename*."""
    with open(filename) as handle:
        yield from (raw.strip() for raw in handle)

def filter_comments(lines):
    """Drop lines that start with '#'; pass everything else through unchanged."""
    yield from (line for line in lines if not line.startswith("#"))

def parse_csv(lines):
    """Naively split each comma-separated line into a list of fields."""
    yield from (line.split(",") for line in lines)

def extract_names(rows):
    """Yield the stripped second field of every row that has at least two fields."""
    for row in rows:
        if len(row) < 2:
            continue  # too short to hold a name
        yield row[1].strip()

# Chain generators into a pipeline
# Each step processes ONE item at a time
pipeline = extract_names(
    parse_csv(
        filter_comments(
            read_lines("data.csv")
        )
    )
)

for name in pipeline:
    print(name)

yield from

Delegate to a sub-generator:

python
def flatten(nested_list):
    """Recursively yield the leaves of an arbitrarily nested list, left to right."""
    for element in nested_list:
        if not isinstance(element, list):
            yield element
        else:
            # Sub-list: delegate iteration to the recursive generator.
            yield from flatten(element)

data = [1, [2, 3], [4, [5, 6]], 7]
print(list(flatten(data)))
# [1, 2, 3, 4, 5, 6, 7]

# Without yield from (equivalent but verbose)
def flatten_verbose(nested_list):
    """Equivalent of flatten() written without ``yield from``."""
    for element in nested_list:
        if not isinstance(element, list):
            yield element
            continue
        # Manually re-yield every leaf produced by the recursive call.
        for leaf in flatten_verbose(element):
            yield leaf

The itertools Module

Python's itertools module provides powerful iterator building blocks:

python
import itertools

# count: infinite counter
for i in itertools.count(10, 2):
    if i > 20:
        break
    print(i, end=" ")  # 10 12 14 16 18 20

# cycle: repeat an iterable forever
colors = itertools.cycle(["red", "green", "blue"])
for _, color in zip(range(6), colors):
    print(color, end=" ")
# red green blue red green blue

# chain: concatenate iterables
combined = itertools.chain([1, 2], [3, 4], [5, 6])
print(list(combined))  # [1, 2, 3, 4, 5, 6]

# islice: slice an iterator
print(list(itertools.islice(range(100), 5, 10)))
# [5, 6, 7, 8, 9]

# groupby: group consecutive elements
data = [("A", 1), ("A", 2), ("B", 3), ("B", 4), ("A", 5)]
for key, group in itertools.groupby(data, key=lambda x: x[0]):
    print(f"{key}: {list(group)}")
# A: [('A', 1), ('A', 2)]
# B: [('B', 3), ('B', 4)]
# A: [('A', 5)]

# product: cartesian product
for combo in itertools.product("AB", "12"):
    print(combo, end=" ")
# ('A', '1') ('A', '2') ('B', '1') ('B', '2')

# permutations and combinations
print(list(itertools.permutations("ABC", 2)))
# [('A', 'B'), ('A', 'C'), ('B', 'A'), ('B', 'C'), ('C', 'A'), ('C', 'B')]

print(list(itertools.combinations("ABC", 2)))
# [('A', 'B'), ('A', 'C'), ('B', 'C')]

send() and Generator Communication

Generators can receive values via send():

python
def running_average():
    """Coroutine: each send(x) resumes with x and yields the mean of all values so far.

    The first next() call advances to the initial yield and produces None.
    """
    total = 0
    count = 0
    average = None
    while True:
        value = yield average
        if value is None:
            # A bare next() sends None; leave the running state untouched.
            continue
        count += 1
        total += value
        average = total / count

avg = running_average()
next(avg)           # Initialize (advance to first yield)
print(avg.send(10)) # 10.0
print(avg.send(20)) # 15.0
print(avg.send(30)) # 20.0
print(avg.send(40)) # 25.0

Practical Example: Data Stream Processor

python
"""
Process a stream of data with generators.
"""

import itertools
from collections import deque

def generate_data():
    """Endlessly yield simulated sensor readings (timestamp, value, sensor id)."""
    import random  # local import mirrors the original example's style
    sensors = ["A", "B", "C"]
    while True:
        yield {
            "timestamp": random.randint(1000, 9999),
            "value": random.uniform(0, 100),
            "sensor": random.choice(sensors),
        }

def filter_sensor(data_stream, sensor_id):
    """Yield only the readings whose "sensor" field equals *sensor_id*."""
    yield from (reading for reading in data_stream if reading["sensor"] == sensor_id)

def moving_average(data_stream, window_size=5):
    """Annotate each point with the mean of the last *window_size* values.

    Points seen before the window first fills are consumed but not yielded.
    """
    window = deque(maxlen=window_size)  # oldest value falls off automatically
    for point in data_stream:
        window.append(point["value"])
        if len(window) < window_size:
            continue  # window not yet full
        mean = sum(window) / window_size
        yield {**point, "avg": round(mean, 2)}

def detect_anomalies(data_stream, threshold=70):
    """Yield points whose rolling average exceeds *threshold*, tagged with anomaly=True."""
    for point in data_stream:
        if point["avg"] <= threshold:
            continue
        yield {**point, "anomaly": True}

# Build processing pipeline
raw = generate_data()
sensor_a = filter_sensor(raw, "A")
averaged = moving_average(sensor_a, window_size=3)
anomalies = detect_anomalies(averaged, threshold=60)

# Process first 5 anomalies
for point in itertools.islice(anomalies, 5):
    print(f"Anomaly! Sensor {point['sensor']} "
          f"avg={point['avg']} value={point['value']:.1f}")

Summary

  • Iterators implement __iter__() and __next__() for sequential access
  • Generator functions use yield to produce values lazily, one at a time
  • Generator expressions: (expr for x in iterable) — lazy version of list comprehensions
  • Generators are memory-efficient for large datasets and infinite sequences
  • Use yield from to delegate to sub-generators
  • itertools provides powerful tools: chain, islice, groupby, product, etc.
  • Generators support pipeline processing — chaining transformations efficiently
  • Use send() to communicate values back into a generator

Next, we'll learn about regular expressions in Python.