Advanced Topics

Python Regular Expressions

Master pattern matching in Python with the re module—learn how to search, match, replace, and validate text using regular expressions.

Python Regular Expressions

Regular expressions (regex) are powerful patterns used to search, match, and manipulate text. Python's built-in re module provides full regex support.


Getting Started

python
import re

# Search for a pattern in a string
result = re.search(r"Python", "I love Python programming")
if result:
    print(result.group())  # Python
    print(result.start())  # 7
    print(result.end())    # 13

Always use raw strings (r"pattern") for regex patterns to avoid issues with backslashes.


Core re Functions

Function — Description
re.search() — Find first match anywhere in string
re.match() — Match at the beginning of string
re.fullmatch() — Match the entire string
re.findall() — Find all matches (returns list of strings)
re.finditer() — Find all matches (returns iterator of match objects)
re.sub() — Replace matches
re.split() — Split string by pattern
re.compile() — Compile pattern for reuse
python
import re

text = "Call 555-1234 or 555-5678 for info"

# search - first match
match = re.search(r"\d{3}-\d{4}", text)
print(match.group())  # 555-1234

# findall - all matches
phones = re.findall(r"\d{3}-\d{4}", text)
print(phones)  # ['555-1234', '555-5678']

# sub - replace
cleaned = re.sub(r"\d{3}-\d{4}", "XXX-XXXX", text)
print(cleaned)  # Call XXX-XXXX or XXX-XXXX for info

# split - split by pattern
parts = re.split(r"\s+", "hello    world   python")
print(parts)  # ['hello', 'world', 'python']

Pattern Syntax

Character Classes

python
import re

# . (dot) - any character except newline
re.findall(r"p.t", "pat, pet, pit, pot, put")
# ['pat', 'pet', 'pit', 'pot', 'put']

# \d - digit: [0-9], plus other Unicode decimal digits for str patterns
#      (pass re.ASCII to restrict it to [0-9])
re.findall(r"\d+", "Order 42, Item 7")
# ['42', '7']

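# \w - word character: [a-zA-Z0-9_], plus Unicode letters and digits for
#      str patterns (pass re.ASCII to restrict it to [a-zA-Z0-9_])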
re.findall(r"\w+", "hello-world_test 123")
# ['hello', 'world_test', '123']

# \s - whitespace: [ \t\n\r\f\v], plus other Unicode whitespace for str
#      patterns (pass re.ASCII to restrict it to that set)
re.split(r"\s+", "hello   world\tpython")
# ['hello', 'world', 'python']

# \D, \W, \S - negations
re.findall(r"\D+", "abc123def456")
# ['abc', 'def']

# Custom character class [...]
re.findall(r"[aeiou]", "hello world")
# ['e', 'o', 'o']

re.findall(r"[^aeiou\s]", "hello world")
# ['h', 'l', 'l', 'w', 'r', 'l', 'd']

Quantifiers

python
import re

text = "aab aaab ab aaaab"

# * - zero or more
re.findall(r"a*b", text)    # ['aab', 'aaab', 'ab', 'aaaab']

# + - one or more
re.findall(r"a+b", text)    # ['aab', 'aaab', 'ab', 'aaaab']

# ? - zero or one
re.findall(r"a?b", text)    # ['ab', 'ab', 'ab', 'ab']

# {n} - exactly n
re.findall(r"a{2}b", text)  # ['aab', 'aab', 'aab']

# {n,m} - between n and m
re.findall(r"a{2,3}b", text)  # ['aab', 'aaab', 'aaab']

# Greedy vs lazy
text = "<div>hello</div>"
re.findall(r"<.*>", text)    # ['<div>hello</div>'] - greedy
re.findall(r"<.*?>", text)   # ['<div>', '</div>'] - lazy

Anchors

python
import re

# ^ - start of string
re.search(r"^Hello", "Hello World")  # Match
re.search(r"^Hello", "Say Hello")    # No match

# $ - end of string (also matches just before a trailing newline)
re.search(r"World$", "Hello World")  # Match
re.search(r"World$", "World Cup")    # No match

# \b - word boundary
re.findall(r"\bcat\b", "cat catalog scatter")
# ['cat'] - only whole word "cat"

Groups and Capturing

python
import re

# Parentheses create groups
match = re.search(r"(\d{3})-(\d{4})", "Call 555-1234")
print(match.group())   # 555-1234 (full match)
print(match.group(1))  # 555
print(match.group(2))  # 1234
print(match.groups())  # ('555', '1234')

# Named groups
match = re.search(
    r"(?P<area>\d{3})-(?P<number>\d{4})",
    "Call 555-1234"
)
print(match.group("area"))    # 555
print(match.group("number"))  # 1234
print(match.groupdict())      # {'area': '555', 'number': '1234'}

# Non-capturing group
re.findall(r"(?:Mr|Ms|Mrs)\.\s(\w+)", "Mr. Smith and Ms. Jones")
# ['Smith', 'Jones'] - only captures the name

# findall with groups
re.findall(r"(\w+)@(\w+)\.(\w+)", "user@mail.com admin@site.org")
# [('user', 'mail', 'com'), ('admin', 'site', 'org')]

Alternation and Lookaround

python
import re

# Alternation (OR)
re.findall(r"cat|dog", "I have a cat and a dog")
# ['cat', 'dog']

# Lookahead (?=...)
re.findall(r"\w+(?=@)", "user@mail.com admin@site.org")
# ['user', 'admin'] - words followed by @

# Negative lookahead (?!...)
# The lookahead must also reject a following digit; otherwise \d+ backtracks
# and r"\d+(?!px)" returns partial numbers: ['1', '20', '3', '40'].
re.findall(r"\d+(?!\d|px)", "10px 20em 30px 40rem")
# ['20', '40'] - numbers NOT followed by px

# Lookbehind (?<=...)
re.findall(r"(?<=\$)\d+", "Price: $100 or $200")
# ['100', '200'] - digits preceded by $

# Negative lookbehind (?<!...)
re.findall(r"(?<!\$)\b\d+\b", "Price: $100 or 200")
# ['200'] - numbers NOT preceded by $

Flags

python
import re

# re.IGNORECASE (re.I)
re.findall(r"python", "Python PYTHON python", re.I)
# ['Python', 'PYTHON', 'python']

# re.MULTILINE (re.M)
text = "line1\nline2\nline3"
re.findall(r"^\w+", text, re.M)
# ['line1', 'line2', 'line3']

# re.DOTALL (re.S) - dot matches newline
text = "<div>\nhello\n</div>"
re.findall(r"<div>.*?</div>", text, re.S)
# ['<div>\nhello\n</div>']

# re.VERBOSE (re.X) - allows comments
pattern = re.compile(r"""
    ^                   # Start of string
    [\w.+-]+            # Username
    @                   # @ symbol
    [\w-]+              # Domain name
    \.                  # Dot
    [a-zA-Z]{2,}        # TLD
    $                   # End of string
""", re.VERBOSE)

print(pattern.match("user@example.com"))  # Match

# Combine flags
re.findall(r"^hello", "Hello\nhello", re.I | re.M)
# ['Hello', 'hello']

Compiled Patterns

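For patterns used multiple times, compile them once and reuse the pattern object. The re module already caches recently used patterns, so the gain is mostly readability plus skipping the cache lookup on every call: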

python
import re

# Build the pattern object a single time and reuse it for every candidate
email_pattern = re.compile(r"[\w.+-]+@[\w-]+\.[\w.]+")

emails = ["user@example.com", "invalid-email", "admin@site.org", "not.an.email"]

# Keep only the candidates that the pattern matches from start to end
valid = list(filter(email_pattern.fullmatch, emails))
print(valid)  # ['user@example.com', 'admin@site.org']

Common Patterns

python
import re

# Email validation
email = r"^[\w.+-]+@[\w-]+\.[\w.]+$"

# Phone number (US)
phone = r"^\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}$"

# URL
url = r"https?://[\w.-]+(?:\.[\w]+)+(?:/[\w./?%&=-]*)?"

# IPv4 address - every octet is limited to 0-255; a bare \d{1,3} would also
# accept out-of-range values such as 999. Leading zeros (e.g. 010) are still
# accepted, as they were before.
ip = r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])$"

# Date (YYYY-MM-DD)
date = r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$"

# Strong password
password = r"^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$"

# Test patterns
tests = {
    "Email": (email, "user@example.com"),
    "Phone": (phone, "(555) 123-4567"),
    "Date":  (date, "2024-01-15"),
}

for name, (pattern, test_value) in tests.items():
    match = re.fullmatch(pattern, test_value)
    print(f"{name}: {test_value} -> {'Valid' if match else 'Invalid'}")

Practical Example: Log Parser

python
"""
Parse structured log files using regex.
"""

import re
from collections import Counter

# Verbose-mode pattern for a single log line. Each named group captures one
# field of "[timestamp] LEVEL module: message". Whitespace in the pattern text
# is ignored under re.VERBOSE, so the separating spaces are written as \s.
log_pattern = re.compile(r"""
    \[(?P<timestamp>\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})\]\s
    (?P<level>INFO|WARNING|ERROR|DEBUG)\s
    (?P<module>[\w.]+):\s
    (?P<message>.+)
""", re.VERBOSE)

# Sample input: six log lines. .strip() removes the leading and trailing
# newline that the triple-quoted literal would otherwise contain.
sample_logs = """
[2024-01-15 10:30:01] INFO server: Application started
[2024-01-15 10:30:05] ERROR database: Connection timeout after 30s
[2024-01-15 10:30:06] WARNING auth: Failed login for user admin
[2024-01-15 10:30:10] INFO server: Request processed in 150ms
[2024-01-15 10:30:15] ERROR database: Query failed: SELECT * FROM users
[2024-01-15 10:30:20] DEBUG cache: Cache miss for key user_42
""".strip()


def parse_logs(log_text):
    """Return one dict of named fields for every line matching log_pattern.

    Each line is stripped of surrounding whitespace before matching; lines
    that do not match are skipped.
    """
    candidates = (log_pattern.match(raw.strip()) for raw in log_text.split("\n"))
    return [found.groupdict() for found in candidates if found]


entries = parse_logs(sample_logs)

# Tally how many entries were logged at each severity level
level_counts = Counter(entry["level"] for entry in entries)
print("Log Level Summary:")
for level, count in level_counts.most_common():
    print(f"  {level}: {count}")

# List every ERROR entry with its timestamp and originating module
print("\nError Messages:")
errors = [entry for entry in entries if entry["level"] == "ERROR"]
for err in errors:
    print(f"  [{err['timestamp']}] {err['module']}: {err['message']}")

Summary

  • Python's re module provides full regular expression support
  • Key functions: search(), findall(), sub(), split(), compile()
  • Character classes: \d (digit), \w (word), \s (space), . (any)
  • Quantifiers: * (0+), + (1+), ? (0-1), {n,m} (range)
  • Anchors: ^ (start), $ (end), \b (word boundary)
  • Groups: () capturing, (?P<name>) named, (?:) non-capturing
  • Lookaround: (?=), (?!), (?<=), (?<!)
  • Use re.VERBOSE for readable patterns with comments
  • Compile frequently used patterns with re.compile()

Next, we'll learn about working with APIs in Python.