Regex


Regex

regular expression = a sequence of characters that specifies a search pattern

presenter: Juliana Gretz
date: 8.10.2021

character

command meaning
. any character
\s any whitespace character
\S any non-whitespace character
\w any word character
\W any non-word character
\d any digit
\D any non-digit

range

command meaning
[abc] character of: a, b or c
[^abc] any character except: a, b or c
[a-z] character in the range of: a-z
[^a-z] character not in the range of: a-z
[0-9] character in the range of: 0-9
[^0-9] character not in the range of: 0-9
(a|b) a or b (note: inside square brackets | is a literal character, so [a|b] matches a, | or b)

quantifiers

command meaning
a? zero or one of a
a* zero or more of a
a+ one or more of a
a{3} 3 of a
a{3,} 3 or more of a
a{3,6} between 3 and 6 of a

boundary

command meaning
^ start of string
$ end of string
\b word boundary
\B non-word boundary

group

command meaning
(xxx) parentheses around the expression xxx create a (capturing) group
(?P\<name>xxx) around the expression xxx to name group

Examples

In [1]:
import re
import pandas as pd
In [2]:
# Example file names used by every regex demonstration below.
datafiles = [
    "20F-22003_DC8_D03_1032246_P1_UV-metric psKa.t3r",
    "20F-29001_DC8_D03_1032246_P1_W_UV-metric psKa.t3r",
    "20F-26012_DC29_F21_1088043_P1_UV-metric psKa.t3r",
    "20G-03010_DC60_L03_1366347_P1_UV-metric pKa.t3r",
    "20G-04001_DC28_F19_1087258_P1_W_UV-metric psKa.t3r",
    "21I-27003_DC245_D13_7018930044_P1_UV-metric psKa.t3r",
    "20H-26004_DC124_J03_1216895_P2_UV-metric pKa.t3r",
    "20I-03024_DC168_L12_105750012_P2_UV-metric psKa.t3r",
    "20J-05001_DC132_K06_1298433_P2_W_UV-metric psKa.t3r",
    "21I-14006_DC218_F07_1056516_P3_UV-metric psKa.t3r",
]

just fitting a regular expression

"\d{2}\w-\d{5}_DC\d{1,3}_\w\d{2}_\d*_P[1-3]_W*_*UV-metric ps*Ka.t3r"g
https://regex101.com/r/K0vABT/4

In [3]:
# Match every file name against one pattern and keep the matched text.
# Compiling once avoids re-parsing the pattern for each file name.
# NOTE: the '.' before 't3r' is unescaped, so it matches any character, not only a literal dot.
filename_pattern = re.compile(
    r"\d{2}\w-\d{5}_DC\d{1,3}_\w\d{2}_\d*_P[1-3]_W*_*UV-metric ps*Ka.t3r"
)
df = pd.DataFrame({'filenames': datafiles})
# re.search returns None when nothing matches; subscripting None would raise
# a TypeError, so non-matching file names get None in the 'match' column instead.
df['match'] = [
    found[0] if found else None
    for found in (filename_pattern.search(file) for file in df.filenames)
]
df
Out[3]:
filenames match
0 20F-22003_DC8_D03_1032246_P1_UV-metric psKa.t3r 20F-22003_DC8_D03_1032246_P1_UV-metric psKa.t3r
1 20F-29001_DC8_D03_1032246_P1_W_UV-metric psKa.t3r 20F-29001_DC8_D03_1032246_P1_W_UV-metric psKa.t3r
2 20F-26012_DC29_F21_1088043_P1_UV-metric psKa.t3r 20F-26012_DC29_F21_1088043_P1_UV-metric psKa.t3r
3 20G-03010_DC60_L03_1366347_P1_UV-metric pKa.t3r 20G-03010_DC60_L03_1366347_P1_UV-metric pKa.t3r
4 20G-04001_DC28_F19_1087258_P1_W_UV-metric psKa... 20G-04001_DC28_F19_1087258_P1_W_UV-metric psKa...
5 21I-27003_DC245_D13_7018930044_P1_UV-metric ps... 21I-27003_DC245_D13_7018930044_P1_UV-metric ps...
6 20H-26004_DC124_J03_1216895_P2_UV-metric pKa.t3r 20H-26004_DC124_J03_1216895_P2_UV-metric pKa.t3r
7 20I-03024_DC168_L12_105750012_P2_UV-metric psK... 20I-03024_DC168_L12_105750012_P2_UV-metric psK...
8 20J-05001_DC132_K06_1298433_P2_W_UV-metric psK... 20J-05001_DC132_K06_1298433_P2_W_UV-metric psK...
9 21I-14006_DC218_F07_1056516_P3_UV-metric psKa.t3r 21I-14006_DC218_F07_1056516_P3_UV-metric psKa.t3r

creating groups

r"\d{2}\w-\d{5}_(DC\d{1,3})_(\w\d{2})_(\d{7,10})_(P[1-3])_(W_)*UV-metric (psKa|pKa).t3r"g
https://regex101.com/r/UkpsCd/3

In [4]:
# Split every file name into its parts with (unnamed) capture groups.
grouped_pattern = re.compile(
    r"\d{2}\w-\d{5}_(DC\d{1,3})_(\w\d{2})_(\d*)_(P[1-3])_(W_)*UV-metric (psKa|pKa).t3r"
)
df = pd.DataFrame({'filenames': datafiles})
df['matches'] = [grouped_pattern.search(file) for file in df.filenames]
# Take the group count from the compiled pattern rather than from the first
# match, so an empty list or a non-matching first file name cannot break this cell.
number_of_groups = grouped_pattern.groups
for i in range(number_of_groups):
    # Column 'group{i}' holds regex group i+1; rows without a match get None.
    df[f'group{i}'] = [match.groups()[i] if match else None for match in df.matches]
df
Out[4]:
filenames matches group0 group1 group2 group3 group4 group5
0 20F-22003_DC8_D03_1032246_P1_UV-metric psKa.t3r <re.Match object; span=(0, 47), match='20F-220... DC8 D03 1032246 P1 None psKa
1 20F-29001_DC8_D03_1032246_P1_W_UV-metric psKa.t3r <re.Match object; span=(0, 49), match='20F-290... DC8 D03 1032246 P1 W_ psKa
2 20F-26012_DC29_F21_1088043_P1_UV-metric psKa.t3r <re.Match object; span=(0, 48), match='20F-260... DC29 F21 1088043 P1 None psKa
3 20G-03010_DC60_L03_1366347_P1_UV-metric pKa.t3r <re.Match object; span=(0, 47), match='20G-030... DC60 L03 1366347 P1 None pKa
4 20G-04001_DC28_F19_1087258_P1_W_UV-metric psKa... <re.Match object; span=(0, 50), match='20G-040... DC28 F19 1087258 P1 W_ psKa
5 21I-27003_DC245_D13_7018930044_P1_UV-metric ps... <re.Match object; span=(0, 52), match='21I-270... DC245 D13 7018930044 P1 None psKa
6 20H-26004_DC124_J03_1216895_P2_UV-metric pKa.t3r <re.Match object; span=(0, 48), match='20H-260... DC124 J03 1216895 P2 None pKa
7 20I-03024_DC168_L12_105750012_P2_UV-metric psK... <re.Match object; span=(0, 51), match='20I-030... DC168 L12 105750012 P2 None psKa
8 20J-05001_DC132_K06_1298433_P2_W_UV-metric psK... <re.Match object; span=(0, 51), match='20J-050... DC132 K06 1298433 P2 W_ psKa
9 21I-14006_DC218_F07_1056516_P3_UV-metric psKa.t3r <re.Match object; span=(0, 49), match='21I-140... DC218 F07 1056516 P3 None psKa

name groups

r"\d{2}\w-\d{5}_(?P<code>DC\d{1,3})_(?P<well_adress>\w\d{2})_(?P<id>\d*)_(?P<plate>P[1-3])_(?P<wdh>W_)*UV-metric (?P<solvent>psKa|pKa).t3r"g
https://regex101.com/r/IOqHvt/2

In [5]:
# Split every file name into its parts with named capture groups, so each
# part can be read by name instead of by position.
named_pattern = re.compile(
    r"\d{2}\w-\d{5}_(?P<code>DC\d{1,3})_(?P<well_adress>\w\d{2})_(?P<id>\d*)_(?P<plate>P[1-3])_"
    r"(?P<wdh>W_)*UV-metric (?P<solvent>psKa|pKa).t3r"
)
df = pd.DataFrame({'filenames': datafiles})
df['matches'] = [named_pattern.search(file) for file in df.filenames]
for variable in ['code', 'well_adress', 'id', 'plate', 'solvent', 'wdh']:
    # re.search returns None for a non-matching file name; keep None in the
    # column instead of failing with an AttributeError on match.group().
    df[variable] = [match.group(variable) if match else None for match in df.matches]
df
Out[5]:
filenames matches code well_adress id plate solvent wdh
0 20F-22003_DC8_D03_1032246_P1_UV-metric psKa.t3r <re.Match object; span=(0, 47), match='20F-220... DC8 D03 1032246 P1 psKa None
1 20F-29001_DC8_D03_1032246_P1_W_UV-metric psKa.t3r <re.Match object; span=(0, 49), match='20F-290... DC8 D03 1032246 P1 psKa W_
2 20F-26012_DC29_F21_1088043_P1_UV-metric psKa.t3r <re.Match object; span=(0, 48), match='20F-260... DC29 F21 1088043 P1 psKa None
3 20G-03010_DC60_L03_1366347_P1_UV-metric pKa.t3r <re.Match object; span=(0, 47), match='20G-030... DC60 L03 1366347 P1 pKa None
4 20G-04001_DC28_F19_1087258_P1_W_UV-metric psKa... <re.Match object; span=(0, 50), match='20G-040... DC28 F19 1087258 P1 psKa W_
5 21I-27003_DC245_D13_7018930044_P1_UV-metric ps... <re.Match object; span=(0, 52), match='21I-270... DC245 D13 7018930044 P1 psKa None
6 20H-26004_DC124_J03_1216895_P2_UV-metric pKa.t3r <re.Match object; span=(0, 48), match='20H-260... DC124 J03 1216895 P2 pKa None
7 20I-03024_DC168_L12_105750012_P2_UV-metric psK... <re.Match object; span=(0, 51), match='20I-030... DC168 L12 105750012 P2 psKa None
8 20J-05001_DC132_K06_1298433_P2_W_UV-metric psK... <re.Match object; span=(0, 51), match='20J-050... DC132 K06 1298433 P2 psKa W_
9 21I-14006_DC218_F07_1056516_P3_UV-metric psKa.t3r <re.Match object; span=(0, 49), match='21I-140... DC218 F07 1056516 P3 psKa None
In [ ]:
 
In [ ]:
 
In [ ]: