Python Regular Expressions

Regular expressions (regex) are powerful patterns used to search, match, and manipulate text. Python's built-in re module provides full regex support.

Getting Started

python

import re

# Search for a pattern in a string; re.search() returns a match object,
# or None when the pattern is not found anywhere in the string
result = re.search(r"Python", "I love Python programming")
if result:
    print(result.group())  # Python (the matched text)
    print(result.start())  # 7  (index of the first matched character)
    print(result.end())    # 13 (index just past the last matched character)

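Always use raw strings (r"pattern") for regex patterns to avoid issues with backslashes: in a normal string literal Python processes escapes first, so "\b" is a backspace character, whereas r"\b" reaches the regex engine as the word-boundary token.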

Core `re` Functions

Function	Description
`re.search()`	Find first match anywhere in string
`re.match()`	Match at the beginning of string
`re.fullmatch()`	Match the entire string
`re.findall()`	Find all matches (returns list of strings)
`re.finditer()`	Find all matches (returns iterator of match objects)
`re.sub()`	Replace matches
`re.split()`	Split string by pattern
`re.compile()`	Compile pattern for reuse

python

import re

text = "Call 555-1234 or 555-5678 for info"

# search - first match only. It returns None when nothing matches, so guard
# the .group() call whenever the input is not known to contain a match.
match = re.search(r"\d{3}-\d{4}", text)
print(match.group())  # 555-1234

# findall - all non-overlapping matches, as a list of strings
phones = re.findall(r"\d{3}-\d{4}", text)
print(phones)  # ['555-1234', '555-5678']

# sub - replace every match with the replacement text
cleaned = re.sub(r"\d{3}-\d{4}", "XXX-XXXX", text)
print(cleaned)  # Call XXX-XXXX or XXX-XXXX for info

# split - split by pattern (here: on each run of whitespace)
parts = re.split(r"\s+", "hello    world   python")
print(parts)  # ['hello', 'world', 'python']

Pattern Syntax

Character Classes

python

import re

# . (dot) - any character except newline
re.findall(r"p.t", "pat, pet, pit, pot, put")
# ['pat', 'pet', 'pit', 'pot', 'put']

# \d - decimal digit: [0-9], plus other Unicode digits in str patterns
# unless re.ASCII is set
re.findall(r"\d+", "Order 42, Item 7")
# ['42', '7']

# \w - word character: [a-zA-Z0-9_], plus other Unicode letters and digits
# in str patterns unless re.ASCII is set
re.findall(r"\w+", "hello-world_test 123")
# ['hello', 'world_test', '123']

# \s - whitespace: [ \t\n\r\f\v], plus other Unicode whitespace in str
# patterns unless re.ASCII is set
re.split(r"\s+", "hello   world\tpython")
# ['hello', 'world', 'python']

# \D, \W, \S - negations of \d, \w and \s
re.findall(r"\D+", "abc123def456")
# ['abc', 'def']

# Custom character class [...] - any single character from the set
re.findall(r"[aeiou]", "hello world")
# ['e', 'o', 'o']

# Negated class [^...] - any single character NOT in the set
# (here: neither a vowel nor whitespace)
re.findall(r"[^aeiou\s]", "hello world")
# ['h', 'l', 'l', 'w', 'r', 'l', 'd']

Quantifiers

python

import re

text = "aab aaab ab aaaab"

# * - zero or more of the preceding item
re.findall(r"a*b", text)    # ['aab', 'aaab', 'ab', 'aaaab']

# + - one or more (same result here: every 'b' has at least one 'a' before it)
re.findall(r"a+b", text)    # ['aab', 'aaab', 'ab', 'aaaab']

# ? - zero or one (so at most the last 'a' before each 'b' is included)
re.findall(r"a?b", text)    # ['ab', 'ab', 'ab', 'ab']

# {n} - exactly n ("ab" has too few; longer runs match on their last two 'a's)
re.findall(r"a{2}b", text)  # ['aab', 'aab', 'aab']

# {n,m} - between n and m, inclusive ("aaaab" matches on its last three 'a's)
re.findall(r"a{2,3}b", text)  # ['aab', 'aaab', 'aaab']

# Greedy vs lazy: .* takes as much as possible, .*? as little as possible
text = "<div>hello</div>"
re.findall(r"<.*>", text)    # ['<div>hello</div>'] - greedy
re.findall(r"<.*?>", text)   # ['<div>', '</div>'] - lazy

Anchors

python

import re

# ^ - start of string (with re.MULTILINE, also the start of each line)
re.search(r"^Hello", "Hello World")  # Match object
re.search(r"^Hello", "Say Hello")    # None (no match)

# $ - end of string, or just before a newline that ends the string
# (with re.MULTILINE, also the end of each line)
re.search(r"World$", "Hello World")  # Match object
re.search(r"World$", "World Cup")    # None (no match)

# \b - word boundary: the edge between a \w character and a non-\w
# character (or the start/end of the string)
re.findall(r"\bcat\b", "cat catalog scatter")
# ['cat'] - only whole word "cat"

Groups and Capturing

python

import re

# Parentheses create groups
match = re.search(r"(\d{3})-(\d{4})", "Call 555-1234")
print(match.group())   # 555-1234 (full match, same as group(0))
print(match.group(1))  # 555 (first group)
print(match.group(2))  # 1234 (second group)
print(match.groups())  # ('555', '1234') (all groups as a tuple)

# Named groups: (?P<name>...) can be read back by name
match = re.search(
    r"(?P<area>\d{3})-(?P<number>\d{4})",
    "Call 555-1234"
)
print(match.group("area"))    # 555
print(match.group("number"))  # 1234
print(match.groupdict())      # {'area': '555', 'number': '1234'}

# Non-capturing group: (?:...) groups the alternatives without capturing them
re.findall(r"(?:Mr|Ms|Mrs)\.\s(\w+)", "Mr. Smith and Ms. Jones")
# ['Smith', 'Jones'] - only captures the name

# findall with groups: returns the groups instead of the full match
# (a list of tuples when the pattern has more than one group)
re.findall(r"(\w+)@(\w+)\.(\w+)", "user@mail.com admin@site.org")
# [('user', 'mail', 'com'), ('admin', 'site', 'org')]

Alternation and Lookaround

python

import re

# Alternation (OR)
re.findall(r"cat|dog", "I have a cat and a dog")
# ['cat', 'dog']

# Lookahead (?=...) - asserts what follows without consuming it
re.findall(r"\w+(?=@)", "user@mail.com admin@site.org")
# ['user', 'admin'] - words followed by @

# Negative lookahead (?!...)
re.findall(r"\d+(?!px)", "10px 20em 30px 40rem")
# ['1', '20', '3', '40'] - note the partial matches '1' and '3': \d+ backtracks
# from "10" to "1", which is followed by "0" rather than "px". To skip the
# px values entirely, also forbid a following digit:
# r"\d+(?!\d|px)" gives ['20', '40']

# Lookbehind (?<=...)
re.findall(r"(?<=\$)\d+", "Price: $100 or $200")
# ['100', '200'] - digits preceded by $

# Negative lookbehind (?<!...)
re.findall(r"(?<!\$)\b\d+\b", "Price: $100 or 200")
# ['200'] - numbers NOT preceded by $ (the \b anchors stop the match from
# starting inside "100", where the preceding character is a digit, not $)

Flags

python

import re

# re.IGNORECASE (re.I) - case-insensitive matching
re.findall(r"python", "Python PYTHON python", re.I)
# ['Python', 'PYTHON', 'python']

# re.MULTILINE (re.M) - ^ and $ also match at the start/end of each line
text = "line1\nline2\nline3"
re.findall(r"^\w+", text, re.M)
# ['line1', 'line2', 'line3']

# re.DOTALL (re.S) - dot matches newline
text = "<div>\nhello\n</div>"
re.findall(r"<div>.*?</div>", text, re.S)
# ['<div>\nhello\n</div>']

# re.VERBOSE (re.X) - whitespace in the pattern is ignored and # starts a
# comment, so the regex can be laid out and annotated line by line.
# NOTE: this simplified pattern accepts a single-label domain only
# (e.g. "example.com", not "mail.example.com").
pattern = re.compile(r"""
    ^                   # Start of string
    [\w.+-]+            # Username
    @                   # @ symbol
    [\w-]+              # Domain name
    \.                  # Dot
    [a-zA-Z]{2,}        # TLD
    $                   # End of string
""", re.VERBOSE)

print(pattern.match("user@example.com"))  # <re.Match object; span=(0, 16), match='user@example.com'>

# Combine flags with | (bitwise OR)
re.findall(r"^hello", "Hello\nhello", re.I | re.M)
# ['Hello', 'hello']

Compiled Patterns

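For patterns used multiple times, compile them once with re.compile(). The re module already caches recently used patterns, so the speed gain is usually small; the main benefits are a reusable, named pattern object and skipping the cache lookup on every call: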

python

import re

# Build the pattern object a single time; its methods can then be reused freely
email_pattern = re.compile(r"[\w.+-]+@[\w-]+\.[\w.]+")

emails = ["user@example.com", "invalid-email", "admin@site.org", "not.an.email"]

# Keep only the candidates that the pattern matches from start to end
# (fullmatch returns None, which filter() drops, for everything else)
valid = list(filter(email_pattern.fullmatch, emails))
print(valid)  # ['user@example.com', 'admin@site.org']

Common Patterns

python

import re

# Email validation (simplified shape check: local part, @, domain, dot, suffix)
email = r"^[\w.+-]+@[\w-]+\.[\w.]+$"

# Phone number (US): optional parentheses around the area code and optional
# "-", "." or whitespace separators
phone = r"^\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}$"

# URL (http or https, host name, optional path/query)
url = r"https?://[\w.-]+(?:\.[\w]+)+(?:/[\w./?%&=-]*)?"

# IP address (IPv4): each octet must be 0-255 without leading zeros, so
# out-of-range input such as "999.999.999.999" is rejected
ip = r"^(?:(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)$"

# Date (YYYY-MM-DD): checks month 01-12 and day 01-31 only, so an impossible
# date such as 2024-02-31 still matches
date = r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$"

# Strong password: at least 8 characters from the allowed set, including a
# lowercase letter, an uppercase letter, a digit and one of @$!%*?&
password = r"^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$"

# Test patterns: label -> (pattern, sample value expected to be valid)
tests = {
    "Email": (email, "user@example.com"),
    "Phone": (phone, "(555) 123-4567"),
    "Date":  (date, "2024-01-15"),
}

for name, (pattern, test_value) in tests.items():
    # fullmatch requires the whole value to match, not just a prefix
    match = re.fullmatch(pattern, test_value)
    print(f"{name}: {test_value} -> {'Valid' if match else 'Invalid'}")

Practical Example: Log Parser

python

"""
Parse structured log files using regex.
"""

import re
from collections import Counter

# Compiled once at import time. Each named group captures one field of a line
# shaped like "[YYYY-MM-DD HH:MM:SS] LEVEL module: message". re.VERBOSE makes
# the regex ignore the line breaks and indentation used to lay it out, which
# is why every separating space is written explicitly as \s.
log_pattern = re.compile(r"""
    \[(?P<timestamp>\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})\]\s
    (?P<level>INFO|WARNING|ERROR|DEBUG)\s
    (?P<module>[\w.]+):\s
    (?P<message>.+)
""", re.VERBOSE)

# Sample input with one log entry per line; .strip() removes the leading and
# trailing newline that the triple-quoted literal would otherwise contain.
sample_logs = """
[2024-01-15 10:30:01] INFO server: Application started
[2024-01-15 10:30:05] ERROR database: Connection timeout after 30s
[2024-01-15 10:30:06] WARNING auth: Failed login for user admin
[2024-01-15 10:30:10] INFO server: Request processed in 150ms
[2024-01-15 10:30:15] ERROR database: Query failed: SELECT * FROM users
[2024-01-15 10:30:20] DEBUG cache: Cache miss for key user_42
""".strip()


def parse_logs(log_text):
    """Return one dict of named fields for each line matching ``log_pattern``.

    The text is split on newlines and every line is stripped of surrounding
    whitespace before matching; lines that do not match are skipped.
    """
    attempts = (log_pattern.match(raw.strip()) for raw in log_text.split("\n"))
    return [hit.groupdict() for hit in attempts if hit]


entries = parse_logs(sample_logs)

# Tally the entries per level and report the most frequent level first
level_counts = Counter(record["level"] for record in entries)
print("Log Level Summary:")
for level, count in level_counts.most_common():
    print(f"  {level}: {count}")

# List the ERROR entries in their original order
print("\nError Messages:")
for entry in entries:
    if entry["level"] != "ERROR":
        continue
    print(f"  [{entry['timestamp']}] {entry['module']}: {entry['message']}")

Summary

Python's re module provides full regular expression support
Key functions: search(), findall(), sub(), split(), compile()
Character classes: \d (digit), \w (word), \s (space), . (any)
Quantifiers: * (0+), + (1+), ? (0-1), {n,m} (range)
Anchors: ^ (start), $ (end), \b (word boundary)
Groups: () capturing, (?P<name>) named, (?:) non-capturing
Lookaround: (?=), (?!), (?<=), (?<!)
Use re.VERBOSE for readable patterns with comments
Compile frequently used patterns with re.compile()

Next, we'll learn about working with APIs in Python.

Python Regular Expressions

Python Regular Expressions

Getting Started

Core re Functions

Pattern Syntax

Character Classes

Quantifiers

Anchors

Groups and Capturing

Alternation and Lookaround

Flags

Compiled Patterns

Common Patterns

Practical Example: Log Parser

Summary

Core `re` Functions