Python Regular Expressions
Regular expressions (regex) are powerful patterns used to search, match, and manipulate text. Python's built-in re module provides full regex support.
Getting Started
python
import re
# Search for a pattern in a string
result = re.search(r"Python", "I love Python programming")
if result:
    print(result.group()) # Python
    print(result.start()) # 7
    print(result.end()) # 13
Always use raw strings (r"pattern") for regex patterns to avoid issues with backslashes.
Core re Functions
| Function | Description |
|---|---|
| `re.search()` | Find first match anywhere in string |
| `re.match()` | Match at the beginning of string |
| `re.fullmatch()` | Match the entire string |
| `re.findall()` | Find all matches (returns list of strings) |
| `re.finditer()` | Find all matches (returns iterator of match objects) |
| `re.sub()` | Replace matches |
| `re.split()` | Split string by pattern |
| `re.compile()` | Compile pattern for reuse |
python
import re
text = "Call 555-1234 or 555-5678 for info"
# search - first match
match = re.search(r"\d{3}-\d{4}", text)
print(match.group()) # 555-1234
# findall - all matches
phones = re.findall(r"\d{3}-\d{4}", text)
print(phones) # ['555-1234', '555-5678']
# sub - replace
cleaned = re.sub(r"\d{3}-\d{4}", "XXX-XXXX", text)
print(cleaned) # Call XXX-XXXX or XXX-XXXX for info
# split - split by pattern
parts = re.split(r"\s+", "hello world python")
print(parts) # ['hello', 'world', 'python']
Pattern Syntax
Character Classes
python
import re
# . (dot) - any character except newline
re.findall(r"p.t", "pat, pet, pit, pot, put")
# ['pat', 'pet', 'pit', 'pot', 'put']
# \d - digit [0-9]
re.findall(r"\d+", "Order 42, Item 7")
# ['42', '7']
# \w - word character [a-zA-Z0-9_]
re.findall(r"\w+", "hello-world_test 123")
# ['hello', 'world_test', '123']
# \s - whitespace [ \t\n\r\f\v]
re.split(r"\s+", "hello world\tpython")
# ['hello', 'world', 'python']
# \D, \W, \S - negations
re.findall(r"\D+", "abc123def456")
# ['abc', 'def']
# Custom character class [...]
re.findall(r"[aeiou]", "hello world")
# ['e', 'o', 'o']
re.findall(r"[^aeiou\s]", "hello world")
# ['h', 'l', 'l', 'w', 'r', 'l', 'd']
Quantifiers
python
import re
text = "aab aaab ab aaaab"
# * - zero or more
re.findall(r"a*b", text) # ['aab', 'aaab', 'ab', 'aaaab']
# + - one or more
re.findall(r"a+b", text) # ['aab', 'aaab', 'ab', 'aaaab']
# ? - zero or one
re.findall(r"a?b", text) # ['ab', 'ab', 'ab', 'ab']
# {n} - exactly n
re.findall(r"a{2}b", text) # ['aab', 'aab', 'aab']
# {n,m} - between n and m
re.findall(r"a{2,3}b", text) # ['aab', 'aaab', 'aaab']
# Greedy vs lazy
text = "<div>hello</div>"
re.findall(r"<.*>", text) # ['<div>hello</div>'] - greedy
re.findall(r"<.*?>", text) # ['<div>', '</div>'] - lazy
Anchors
python
import re
# ^ - start of string
re.search(r"^Hello", "Hello World") # Match
re.search(r"^Hello", "Say Hello") # No match
# $ - end of string
re.search(r"World$", "Hello World") # Match
re.search(r"World$", "World Cup") # No match
# \b - word boundary
re.findall(r"\bcat\b", "cat catalog scatter")
# ['cat'] - only whole word "cat"
Groups and Capturing
python
import re
# Parentheses create groups
match = re.search(r"(\d{3})-(\d{4})", "Call 555-1234")
print(match.group()) # 555-1234 (full match)
print(match.group(1)) # 555
print(match.group(2)) # 1234
print(match.groups()) # ('555', '1234')
# Named groups
match = re.search(
r"(?P<area>\d{3})-(?P<number>\d{4})",
"Call 555-1234"
)
print(match.group("area")) # 555
print(match.group("number")) # 1234
print(match.groupdict()) # {'area': '555', 'number': '1234'}
# Non-capturing group
re.findall(r"(?:Mr|Ms|Mrs)\.\s(\w+)", "Mr. Smith and Ms. Jones")
# ['Smith', 'Jones'] - only captures the name
# findall with groups
re.findall(r"(\w+)@(\w+)\.(\w+)", "user@mail.com admin@site.org")
# [('user', 'mail', 'com'), ('admin', 'site', 'org')]
Alternation and Lookaround
python
import re
# Alternation (OR)
re.findall(r"cat|dog", "I have a cat and a dog")
# ['cat', 'dog']
# Lookahead (?=...)
re.findall(r"\w+(?=@)", "user@mail.com admin@site.org")
# ['user', 'admin'] - words followed by @
# Negative lookahead (?!...)
re.findall(r"\d+(?!px)", "10px 20em 30px 40rem")
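# ['1', '20', '3', '40'] - \d+ backtracks: in "10px" it gives up the "0" and matches "1", which is followed by "0", not "px"
# To skip such numbers entirely, also forbid a following digit: r"\d+(?!px|\d)" -> ['20', '40']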
# Lookbehind (?<=...)
re.findall(r"(?<=\$)\d+", "Price: $100 or $200")
# ['100', '200'] - digits preceded by $
# Negative lookbehind (?<!...)
re.findall(r"(?<!\$)\b\d+\b", "Price: $100 or 200")
# ['200'] - numbers NOT preceded by $
Flags
python
import re
# re.IGNORECASE (re.I)
re.findall(r"python", "Python PYTHON python", re.I)
# ['Python', 'PYTHON', 'python']
# re.MULTILINE (re.M)
text = "line1\nline2\nline3"
re.findall(r"^\w+", text, re.M)
# ['line1', 'line2', 'line3']
# re.DOTALL (re.S) - dot matches newline
text = "<div>\nhello\n</div>"
re.findall(r"<div>.*?</div>", text, re.S)
# ['<div>\nhello\n</div>']
# re.VERBOSE (re.X) - allows comments
pattern = re.compile(r"""
^ # Start of string
[\w.+-]+ # Username
@ # @ symbol
[\w-]+ # Domain name
\. # Dot
[a-zA-Z]{2,} # TLD
$ # End of string
""", re.VERBOSE)
print(pattern.match("user@example.com")) # Match
# Combine flags
re.findall(r"^hello", "Hello\nhello", re.I | re.M)
# ['Hello', 'hello']
Compiled Patterns
For patterns used multiple times, compile them for better performance:
python
import re
# Compile once, use many times
email_pattern = re.compile(r"[\w.+-]+@[\w-]+\.[\w.]+")
emails = [
"user@example.com",
"invalid-email",
"admin@site.org",
"not.an.email",
]
valid = [e for e in emails if email_pattern.fullmatch(e)]
print(valid) # ['user@example.com', 'admin@site.org']
Common Patterns
python
import re
# Email validation
email = r"^[\w.+-]+@[\w-]+\.[\w.]+$"
# Phone number (US)
phone = r"^\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}$"
# URL
url = r"https?://[\w.-]+(?:\.[\w]+)+(?:/[\w./?%&=-]*)?"
# IP address (IPv4 dotted-quad shape only; does not check that each octet is 0-255)
ip = r"^(?:\d{1,3}\.){3}\d{1,3}$"
# Date (YYYY-MM-DD)
date = r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$"
# Strong password
password = r"^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$"
# Test patterns
tests = {
"Email": (email, "user@example.com"),
"Phone": (phone, "(555) 123-4567"),
"Date": (date, "2024-01-15"),
}
for name, (pattern, test_value) in tests.items():
    match = re.fullmatch(pattern, test_value)
    print(f"{name}: {test_value} -> {'Valid' if match else 'Invalid'}")
Practical Example: Log Parser
python
"""
Parse structured log files using regex.
"""
import re
from collections import Counter
# One log line: "[YYYY-MM-DD HH:MM:SS] LEVEL module: message".
# re.VERBOSE ignores the line breaks inside the pattern, so every literal
# space in a log line is matched explicitly with \s.
log_pattern = re.compile(r"""
\[(?P<timestamp>\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})\]\s
(?P<level>INFO|WARNING|ERROR|DEBUG)\s
(?P<module>[\w.]+):\s
(?P<message>.+)
""", re.VERBOSE)
sample_logs = """
[2024-01-15 10:30:01] INFO server: Application started
[2024-01-15 10:30:05] ERROR database: Connection timeout after 30s
[2024-01-15 10:30:06] WARNING auth: Failed login for user admin
[2024-01-15 10:30:10] INFO server: Request processed in 150ms
[2024-01-15 10:30:15] ERROR database: Query failed: SELECT * FROM users
[2024-01-15 10:30:20] DEBUG cache: Cache miss for key user_42
""".strip()
def parse_logs(log_text):
    """Parse log entries and return structured data.

    Args:
        log_text: Log text with one entry per line.

    Returns:
        A list of dicts, one per line that matches ``log_pattern``, each
        with the keys ``timestamp``, ``level``, ``module`` and ``message``.
        Lines that do not match are skipped.
    """
    entries = []
    for line in log_text.split("\n"):
        # Strip first so indented lines and a trailing "\r" still match.
        match = log_pattern.match(line.strip())
        if match:
            entries.append(match.groupdict())
    return entries
entries = parse_logs(sample_logs)
# Count by level
level_counts = Counter(e["level"] for e in entries)
print("Log Level Summary:")
# most_common() yields (level, count) pairs, highest count first.
for level, count in level_counts.most_common():
    print(f" {level}: {count}")
# Show errors
print("\nError Messages:")
for entry in entries:
    if entry["level"] == "ERROR":
        print(f" [{entry['timestamp']}] {entry['module']}: {entry['message']}")
Summary
- Python's `re` module provides full regular expression support
- Key functions: `search()`, `findall()`, `sub()`, `split()`, `compile()`
- Character classes: `\d` (digit), `\w` (word), `\s` (space), `.` (any)
- Quantifiers: `*` (0+), `+` (1+), `?` (0-1), `{n,m}` (range)
- Anchors: `^` (start), `$` (end), `\b` (word boundary)
- Groups: `()` capturing, `(?P<name>)` named, `(?:)` non-capturing
- Lookaround: `(?=)`, `(?!)`, `(?<=)`, `(?<!)`
- Use `re.VERBOSE` for readable patterns with comments
- Compile frequently used patterns with `re.compile()`
Next, we'll learn about working with APIs in Python.