Regular Expressions : RegEx

. - Any Character Except New Line
\d - Digit (0-9)
\D - Not a Digit (0-9)
\w - Word Character (a-z, A-Z, 0-9, _)
\W - Not a Word Character
\s - Whitespace (space, tab, newline)
\S - Not Whitespace (space, tab, newline)
\b - Word Boundary
\B - Not a Word Boundary
^ - Beginning of a String
$ - End of a String
[] - Matches Characters in brackets
[^ ] - Matches Characters NOT in brackets
| - Either Or
( ) - Group
Quantifiers:
* - 0 or More
+ - 1 or More
? - 0 or One
{3} - Exact Number
{3,4} - Range of Numbers (Minimum, Maximum)
import re

# Sample text that the patterns in this walkthrough are searched against.
# It is a raw string so that the backslash on the metacharacter line is kept
# literally instead of being parsed as an (invalid) escape sequence.
text_to_search = r'''
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
123abc

Hello HelloHello

MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )

gmail.com

321-555-4321
123.555.1234

abhi-arya@gmail.com

Mr. Johnson
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

# A pattern made only of ordinary characters matches that exact sequence, in
# that order. The sample text contains 'abc' but never 'cba', so this loop
# prints nothing.
pattern = re.compile(r'cba')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)
Output: This searches the text for the literal string 'cba'. Nothing is printed, because the text contains 'abc' but never 'cba'.

Searching special characters

Different special characters:

Square brackets []
Square brackets are used to specify a character set: exactly one character from the set must match at that position, unless a quantifier says otherwise.

# An unescaped '.' matches any single character except a newline, so this
# prints one match object per non-newline character of the text.
pattern = re.compile(r'.')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)
Output: Any Character Except New Line
# Escaping the dot makes it match only a literal '.' character.
pattern = re.compile(r'\.')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

# \D matches any single character that is not a digit.
pattern = re.compile(r'\D')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

# A digit immediately followed by a word character (letter, digit or '_').
pattern = re.compile(r'\d\w')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

# A digit immediately followed by a whitespace character.
pattern = re.compile(r'\d\s')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

Word boundary

# Target line in the sample text: "Hello HelloHello"
# A plain literal search finds every occurrence of 'Hello', including both
# halves of 'HelloHello'.
pattern = re.compile(r'Hello')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)
<re.Match object; span=(74, 79), match='Hello'>
<re.Match object; span=(80, 85), match='Hello'>
<re.Match object; span=(85, 90), match='Hello'>
# \b after the literal requires a word boundary right after 'Hello', so the
# first half of 'HelloHello' (which is followed by 'H') is skipped.
pattern = re.compile(r'Hello\b')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)
<re.Match object; span=(74, 79), match='Hello'>
<re.Match object; span=(85, 90), match='Hello'>
Note: Here we searched for occurrences where 'Hello' is followed by a word boundary (space, newline, etc.).
# Word boundaries on both sides: only a stand-alone 'Hello' matches, so
# neither half of 'HelloHello' qualifies.
pattern = re.compile(r'\bHello\b')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)
<re.Match object; span=(74, 79), match='Hello'>
# Other examples worth trying:

# \B is the opposite of \b: 'Hello' must NOT start at a word boundary but
# must end at one, which selects only the second half of 'HelloHello'.
pattern = re.compile(r'\BHello\b')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

# A digit that sits at the start of a word.
pattern = re.compile(r'\b\d')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

# A whitespace character (space, tab, newline) at the very beginning of the
# string. Without re.MULTILINE, '^' anchors only at the start of the string.
pattern = re.compile(r'^\s')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

Character sets

# A character set matches exactly one of the listed characters: here a
# '1', '2' or '3', immediately followed by any word character.
pattern = re.compile(r'[123]\w')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)
<re.Match object; span=(55, 57), match='12'>
<re.Match object; span=(57, 59), match='34'>
<re.Match object; span=(66, 68), match='12'>
<re.Match object; span=(68, 70), match='3a'>
<re.Match object; span=(169, 171), match='32'>
<re.Match object; span=(178, 180), match='32'>
<re.Match object; span=(182, 184), match='12'>
<re.Match object; span=(190, 192), match='12'>
<re.Match object; span=(192, 194), match='34'>
# Inside brackets, '-' between two characters denotes a range.
# Two consecutive lowercase letters.
pattern = re.compile(r'[a-z][a-z]')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

# A letter or digit followed by a letter or a hyphen. The trailing '-' is
# literal because it is the last character in the set. The upper-case range
# is 'A-Z'; 'A-z' would also admit the punctuation between 'Z' and 'a'.
pattern = re.compile(r'[a-zA-Z0-9][a-zA-Z-]')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

# A leading '^' negates the set: a letter followed by any character that is
# not a letter.
pattern = re.compile(r'[a-zA-Z][^a-zA-Z]')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

Character groups

# A group with '|' matches any one of the alternatives; here 'abc', 'com' or
# 'texas', and the match must end at a word boundary.
pattern = re.compile(r'(abc|com|texas)\b')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

# Either a single uppercase letter or the literal 'llo', followed by one
# letter. The range is 'A-Z'; 'A-z' would also admit the punctuation that
# sits between 'Z' and 'a'.
pattern = re.compile(r'([A-Z]|llo)[a-zA-Z]')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

Quantifiers

# '?' makes the preceding item optional: 'Mr', an optional period, one
# whitespace character, then a single uppercase letter.
pattern = re.compile(r'Mr\.?\s[A-Z]')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

# '*' allows zero or more lowercase letters after the capital, so the rest
# of the name is included and a one-letter name still matches.
pattern = re.compile(r'Mr\.?\s[A-Z][a-z]*')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

# Same shape as above, but the group selects the titles 'Ms' or 'Mrs'.
pattern = re.compile(r'M(s|rs)\.?\s[A-Z][a-z]*')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

# {n} repeats an item exactly n times: three digits, a '.' or '-', three
# digits, a '.' or '-', then four digits.
pattern = re.compile(r'\d{3}[.-]\d{3}[.-]\d{4}')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

# One or more letters, digits or underscores, a literal dot, then exactly
# three lowercase letters.
pattern = re.compile(r'[a-zA-Z0-9_]+\.[a-z]{3}')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

# Email-like text: a local part, '@', a domain label, a literal dot, then one
# or more letters, digits, hyphens or dots.
pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
matches = pattern.finditer(text_to_search)
for mat in matches:
    print(mat)

Accessing information in the Match object

# Email-like pattern whose final part is limited to 2-4 characters.
pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]{2,4}')
matches = pattern.finditer(text_to_search)
for mat in matches:
    # span(0) is the (start, end) index pair of the whole match.
    print(mat.span(0))
    # group(0) is the text of the whole match.
    print(mat.group(0))
    # Slicing the searched text with the span yields the same text as group(0).
    print(text_to_search[mat.span(0)[0]:mat.span(0)[1]])
# Sample URLs used to demonstrate capture groups.
urls = r'''
https://www.google.com
http://yahoo.com
https://www.whitehouse.gov
https://craigslist.org
'''

# 'http' with an optional 's', '://', an optional 'www.', then a name, a dot
# and a suffix, each made of word characters.
pattern = re.compile(r'https?://(www\.)?\w+\.\w+')
matches = pattern.finditer(urls)
for mat in matches:
    print(mat)

# Same pattern with the name and the suffix captured as groups 2 and 3
# (group 1 is the optional 'www.'), so the bare domain can be rebuilt.
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = pattern.finditer(urls)
for mat in matches:
    print(mat.group(2)+mat.group(3))

# The same domain can also be recovered by slicing the searched text with
# the spans of groups 2 and 3; group(0) is the full matched URL.
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = pattern.finditer(urls)
for mat in matches:
    print(mat.group(0))
    print(urls[mat.span(2)[0]:mat.span(2)[1]]+urls[mat.span(3)[0]:mat.span(3)[1]])

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Abhinandan Arya

Abhinandan Arya

Senior Applied Scientist | Satellite, Space and Earth Observation (EO) Data