Validating email addresses with .NET regex

code 10 May 2011 | 0 Comments

I did email address validation in Haskell a while back, and since I recently discovered .NET’s “balancing groups” regex feature, this seemed like a good time to do the same for .NET.

Here is the code:

// Builds an email-address regex bottom-up from small named fragments, then
// runs it against a folded sample address and prints the two named captures.
// The fragment names appear to mirror the RFC 2822/5322 grammar (FWS, CFWS,
// quoted-pair, dot-atom, ...) -- NOTE(review): confirm against the RFC text.

// --- Character-level fragments ---------------------------------------------
// One CR LF line break.
var crlf = @"(\r\n)";
// A single space or horizontal tab.
var wsp = @"[ \t]";
// Printable ASCII, U+0021..U+007E.
var vchar = @"[\u0021-\u007e]";
// Control characters other than NUL, tab, LF and CR.
var obsNoWsCtl = @"[\u0001-\u0008\u000b\u000c\u000e-\u001f\u007f]";
// Text allowed inside a comment: printable ASCII except '(', ')' and '\'.
var commentText = @"([\u0021-\u0027\u002a-\u005b\u005d-\u007e]|" + obsNoWsCtl + ")";
// A backslash followed by one escaped character (printable, whitespace, CR, LF, NUL or control).
var quotedPair = @"\\(" + vchar + "|" + wsp + @"|[\r\n\0]|" + obsNoWsCtl + ")";
// Text allowed inside a quoted string: printable ASCII except '"' and '\'.
var quotedText = @"([\u0021\u0023-\u005b\u005d-\u007e]|"+obsNoWsCtl+")";
// Text allowed inside a [domain literal]: printable ASCII except '[', '\' and ']'.
// This is NOT a verbatim string, so the C# compiler expands the \u escapes and
// the regex receives the literal class [!-Z^-~]; that is an equivalent class.
var domainText = "([\u0021-\u005A\u005e-\u007e]|" + obsNoWsCtl + ")";
// Characters allowed in an unquoted atom; '-' is last so it is taken literally.
var atomText = "[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]";

// --- Whitespace and comments -----------------------------------------------
// Folding whitespace: either a run of whitespace optionally followed by one
// fold (CR LF + whitespace), or one or more folds. A fold must be followed by
// at least one space or tab.
var fws = "(" + wsp + "+" + "(" + crlf + wsp + "+)?|(" + crlf + wsp + "+)+)";
// A parenthesised comment with arbitrary nesting. Each inner '(' pushes onto
// the 'paren' capture stack, each inner ')' pops it (a .NET balancing group),
// and (?(paren)(?!)) fails the match if any inner '(' is still unclosed when
// the outer ')' is reached.
var comment = @"(\((((?'paren'\()|(?'-paren'\))|"+commentText+"|"+fws+"|"+quotedPair+@")*(?(paren)(?!)))\))";
// Any number of comments and folding-whitespace runs.
// NOTE: every use below appends "?" to this fragment, which turns the trailing
// "*" into a lazy "*?" rather than making the group optional. Because the
// final pattern is anchored at both ends the accepted strings are the same,
// so the text is left as is to stay identical to the published pattern.
var cfws = "(" + comment + "|" + fws + ")*";

// --- Words -------------------------------------------------------------------
var atom = "(" + atomText + "+)";
var quotedContent = "(" + quotedText + "|" + quotedPair + ")";
// A double-quoted string; folding whitespace may appear between its characters.
var quotedString = "(\"(" + fws +"?" + quotedContent + ")*" + fws + "?\")";
// One dot-separated element: an atom or a quoted string, with optional
// comments/whitespace on either side.
var dottedAtoms1 = "(" + cfws + "?(" + atom + "|" + quotedString + ")" + cfws + "?)";
// One or more elements separated by literal dots.
var dottedAtoms = "(" + dottedAtoms1 + @"(\." + dottedAtoms1 + ")*)";

// --- Address -----------------------------------------------------------------
var localPart = "(?'localPart'" + dottedAtoms + ")";
var domainLiteral = "(" + cfws + @"?\[(" + fws + "?" + domainText + ")*" + fws + @"?\]" + cfws +"?)";
// Either a dotted name or a bracketed literal. The dotted form reuses
// dottedAtoms, so quoted strings are accepted in the domain as well.
var domain = "(?'domain'" + dottedAtoms + "|" + domainLiteral + ")";
// NOTE: without RegexOptions.Multiline, "$" also matches just before a final
// '\n'; use \z in place of "$" if a trailing newline must be rejected.
var email = "^" + localPart + "@" + domain + "$";

// Uncomment to dump the generated pattern.
// Console.WriteLine(email);

// Sample address folded across several lines. The folds are written out as
// explicit CR LF + space instead of relying on a multi-line verbatim string:
// a verbatim string takes whatever line endings the source file has, and the
// pattern only accepts a line break that is CR LF followed by whitespace.
var valid = string.Join("\r\n ", new[] { "I.", "am.", "a.", "nice.", "guy@(yeah)you.com" });

// ExplicitCapture keeps only the named groups (localPart, domain and the
// paren stack). Singleline has no effect because the pattern contains no
// unescaped '.'; it is kept so the call matches the original.
var match = Regex.Match(valid, email, RegexOptions.ExplicitCapture|RegexOptions.Singleline);
if (match.Success)
{
    Console.WriteLine("Parsed: {0} <@> {1}", match.Groups["localPart"], match.Groups["domain"]);
}
else
{
    // Report the failure instead of printing empty groups as if parsing had worked.
    Console.WriteLine("Not a valid address: {0}", valid);
}
view raw gistfile1.cs This Gist brought to you by GitHub.

And here is the regex that the code generates (line-wrapped here for display). Despite supporting comments and a more recent RFC, it’s about ⅔ of the size of Perl’s Mail::RFC822::Address.

^(?'localPart'((((\((((?'paren'\()|(?'-paren'\))|([\u0021-\u
0027\u002a-\u005b\u005d-\u007e]|[\u0001-\u0008\u000b\u000c\u
000e-\u001f\u007f])|([ \t]+((\r\n)[ \t]+)?|((\r\n)[ \t]+)+)|
\\([\u0021-\u007e]|[ \t]|[\r\n\0]|[\u0001-\u0008\u000b\u000c
\u000e-\u001f\u007f]))*(?(paren)(?!)))\))|([ \t]+((\r\n)[ \t
]+)?|((\r\n)[ \t]+)+))*?(([a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)|(
"(([ \t]+((\r\n)[ \t]+)?|((\r\n)[ \t]+)+)?(([\u0021\u0023-\u
005b\u005d-\u007e]|[\u0001-\u0008\u000b\u000c\u000e-\u001f\u
007f])|\\([\u0021-\u007e]|[ \t]|[\r\n\0]|[\u0001-\u0008\u000
b\u000c\u000e-\u001f\u007f])))*([ \t]+((\r\n)[ \t]+)?|((\r\n
)[ \t]+)+)?"))((\((((?'paren'\()|(?'-paren'\))|([\u0021-\u00
27\u002a-\u005b\u005d-\u007e]|[\u0001-\u0008\u000b\u000c\u00
0e-\u001f\u007f])|([ \t]+((\r\n)[ \t]+)?|((\r\n)[ \t]+)+)|\\
([\u0021-\u007e]|[ \t]|[\r\n\0]|[\u0001-\u0008\u000b\u000c\u
000e-\u001f\u007f]))*(?(paren)(?!)))\))|([ \t]+((\r\n)[ \t]+
)?|((\r\n)[ \t]+)+))*?)(\.(((\((((?'paren'\()|(?'-paren'\))|
([\u0021-\u0027\u002a-\u005b\u005d-\u007e]|[\u0001-\u0008\u0
00b\u000c\u000e-\u001f\u007f])|([ \t]+((\r\n)[ \t]+)?|((\r\n
)[ \t]+)+)|\\([\u0021-\u007e]|[ \t]|[\r\n\0]|[\u0001-\u0008\
u000b\u000c\u000e-\u001f\u007f]))*(?(paren)(?!)))\))|([ \t]+
((\r\n)[ \t]+)?|((\r\n)[ \t]+)+))*?(([a-zA-Z0-9!#$%&'*+/=?^_
`{|}~-]+)|("(([ \t]+((\r\n)[ \t]+)?|((\r\n)[ \t]+)+)?(([\u00
21\u0023-\u005b\u005d-\u007e]|[\u0001-\u0008\u000b\u000c\u00
0e-\u001f\u007f])|\\([\u0021-\u007e]|[ \t]|[\r\n\0]|[\u0001-
\u0008\u000b\u000c\u000e-\u001f\u007f])))*([ \t]+((\r\n)[ \t
]+)?|((\r\n)[ \t]+)+)?"))((\((((?'paren'\()|(?'-paren'\))|([
\u0021-\u0027\u002a-\u005b\u005d-\u007e]|[\u0001-\u0008\u000
b\u000c\u000e-\u001f\u007f])|([ \t]+((\r\n)[ \t]+)?|((\r\n)
[ \t]+)+)|\\([\u0021-\u007e]|[ \t]|[\r\n\0]|[\u0001-\u0008\u0
00b\u000c\u000e-\u001f\u007f]))*(?(paren)(?!)))\))|([ \t]+((
\r\n)[ \t]+)?|((\r\n)[ \t]+)+))*?))*))@(?'domain'((((\((((?'
paren'\()|(?'-paren'\))|([\u0021-\u0027\u002a-\u005b\u005d-\
u007e]|[\u0001-\u0008\u000b\u000c\u000e-\u001f\u007f])|([ \t
]+((\r\n)[ \t]+)?|((\r\n)[ \t]+)+)|\\([\u0021-\u007e]|[ \t]|
[\r\n\0]|[\u0001-\u0008\u000b\u000c\u000e-\u001f\u007f]))*(?
(paren)(?!)))\))|([ \t]+((\r\n)[ \t]+)?|((\r\n)[ \t]+)+))*?(
([a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)|("(([ \t]+((\r\n)[ \t]+)?|
((\r\n)[ \t]+)+)?(([\u0021\u0023-\u005b\u005d-\u007e]|[\u000
1-\u0008\u000b\u000c\u000e-\u001f\u007f])|\\([\u0021-\u007e]
|[ \t]|[\r\n\0]|[\u0001-\u0008\u000b\u000c\u000e-\u001f\u007
f])))*([ \t]+((\r\n)[ \t]+)?|((\r\n)[ \t]+)+)?"))((\((((?'pa
ren'\()|(?'-paren'\))|([\u0021-\u0027\u002a-\u005b\u005d-\u0
07e]|[\u0001-\u0008\u000b\u000c\u000e-\u001f\u007f])|([ \t]+
((\r\n)[ \t]+)?|((\r\n)[ \t]+)+)|\\([\u0021-\u007e]|[ \t]|[\
r\n\0]|[\u0001-\u0008\u000b\u000c\u000e-\u001f\u007f]))*(?(p
aren)(?!)))\))|([ \t]+((\r\n)[ \t]+)?|((\r\n)[ \t]+)+))*?)(\
.(((\((((?'paren'\()|(?'-paren'\))|([\u0021-\u0027\u002a-\u0
05b\u005d-\u007e]|[\u0001-\u0008\u000b\u000c\u000e-\u001f\u0
07f])|([ \t]+((\r\n)[ \t]+)?|((\r\n)[ \t]+)+)|\\([\u0021-\u0
07e]|[ \t]|[\r\n\0]|[\u0001-\u0008\u000b\u000c\u000e-\u001f\
u007f]))*(?(paren)(?!)))\))|([ \t]+((\r\n)[ \t]+)?|((\r\n)
[ \t]+)+))*?(([a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)|("(([ \t]+((\r\
n)[ \t]+)?|((\r\n)[ \t]+)+)?(([\u0021\u0023-\u005b\u005d-\u0
07e]|[\u0001-\u0008\u000b\u000c\u000e-\u001f\u007f])|\\([\u0
021-\u007e]|[ \t]|[\r\n\0]|[\u0001-\u0008\u000b\u000c\u000e-
\u001f\u007f])))*([ \t]+((\r\n)[ \t]+)?|((\r\n)[ \t]+)+)?"))
((\((((?'paren'\()|(?'-paren'\))|([\u0021-\u0027\u002a-\u005
b\u005d-\u007e]|[\u0001-\u0008\u000b\u000c\u000e-\u001f\u007
f])|([ \t]+((\r\n)[ \t]+)?|((\r\n)[ \t]+)+)|\\([\u0021-\u007
e]|[ \t]|[\r\n\0]|[\u0001-\u0008\u000b\u000c\u000e-\u001f\u0
07f]))*(?(paren)(?!)))\))|([ \t]+((\r\n)[ \t]+)?|((\r\n)[ \t
]+)+))*?))*)|(((\((((?'paren'\()|(?'-paren'\))|([\u0021-\u00
27\u002a-\u005b\u005d-\u007e]|[\u0001-\u0008\u000b\u000c\u00
0e-\u001f\u007f])|([ \t]+((\r\n)[ \t]+)?|((\r\n)[ \t]+)+)|\\
([\u0021-\u007e]|[ \t]|[\r\n\0]|[\u0001-\u0008\u000b\u000c\u
000e-\u001f\u007f]))*(?(paren)(?!)))\))|([ \t]+((\r\n)[ \t]+
)?|((\r\n)[ \t]+)+))*?\[(([ \t]+((\r\n)[ \t]+)?|((\r\n)[ \t]
+)+)?([!-Z^-~]|[\u0001-\u0008\u000b\u000c\u000e-\u001f\u007f
]))*([ \t]+((\r\n)[ \t]+)?|((\r\n)[ \t]+)+)?\]((\((((?'paren
'\()|(?'-paren'\))|([\u0021-\u0027\u002a-\u005b\u005d-\u007e
]|[\u0001-\u0008\u000b\u000c\u000e-\u001f\u007f])|([ \t]+((\
r\n)[ \t]+)?|((\r\n)[ \t]+)+)|\\([\u0021-\u007e]|[ \t]|[\r\n
\0]|[\u0001-\u0008\u000b\u000c\u000e-\u001f\u007f]))*(?(pare
n)(?!)))\))|([ \t]+((\r\n)[ \t]+)?|((\r\n)[ \t]+)+))*?))$

Tagged in , , , ,

Leave a Reply