Regex functions

A set of functions I’ve written to exploit the regular expression engine in vbscript, making the regular expression pattern-matching available on the Excel worksheet. Note that these all use early binding to VBScript_RegExp_55 in vbscript.dll. These are available for download here, bundled into an .xla file. Source code is as follows:

Public Function RXFIND(ByRef find_pattern As Variant, _
                        ByRef within_text As Variant, _
                        Optional ByVal start_num As Long = 0, _
                        Optional ByVal case_sensitive As Boolean = True) As Variant
' RXFIND - Returns the starting position of text matching the regex pattern
' find_pattern in the string within_text, if a match is found.
' Optional long start_num specifies the number of the character in within_text
' to start matching. Default=0.
' Optional boolean case_sensitive makes the pattern case sensitive if true,
' insensitive otherwise. Default=true.

Dim objRegex As VBScript_RegExp_55.RegExp
Dim colMatch As VBScript_RegExp_55.MatchCollection
Dim vbsMatch As VBScript_RegExp_55.Match
Dim sMatchString As String

Set objRegex = New VBScript_RegExp_55.RegExp

' Initialise Regex object
With objRegex
    .Global = False
    ' Default is case sensitive
    If case_sensitive Then
        .IgnoreCase = False
    Else: .IgnoreCase = True
    End If
    .pattern = find_pattern
End With

' Return out of bounds error
If start_num >= Len(within_text) Then
    RXFIND = CVErr(xlErrNum)
    Exit Function
End If

' Get string from char at start_num
sMatchString = Right$(within_text, Len(within_text) - start_num)

' Create Match collection
Set colMatch = objRegex.Execute(sMatchString)
If colMatch.Count = 0 Then ' No match
    RXFIND = 0
Else
    Set vbsMatch = colMatch(0)
    RXFIND = vbsMatch.FirstIndex + start_num + 1
End If
End Function

Public Function ISRXMATCH(ByRef find_pattern As Variant, _
                        ByRef within_text As Variant, _
                        Optional ByVal case_sensitive As Boolean = True) As Boolean
' ISRXMATCH - Returns true if the regex pattern find_pattern is matched
' in the string within_text, false otherwise.
' Optional boolean case_sensitive makes the pattern case sensitive if true,
' insensitive otherwise. Default=true.

Dim objRegex As VBScript_RegExp_55.RegExp

Set objRegex = New VBScript_RegExp_55.RegExp

' Initialise Regex object
With objRegex
    .Global = False
    ' Default is case sensitive
    If case_sensitive Then
        .IgnoreCase = False
    Else: .IgnoreCase = True
    End If
    .pattern = find_pattern
End With

' Test for pattern and return result
ISRXMATCH = objRegex.test(within_text)

End Function

Public Function RXGET(ByRef find_pattern As Variant, _
                        ByRef within_text As Variant, _
                        Optional ByVal submatch As Long = 0, _
                        Optional ByVal start_num As Long = 0, _
                        Optional ByVal case_sensitive As Boolean = True) As Variant
' RXGET - Looks for a match for regular expression pattern find_pattern
' in the string within_text and returns it if found, error otherwise.
' Optional long submatch may be used to return the corresponding submatch
' if specified - otherwise the entire match is returned.
' Optional long start_num specifies the number of the character to start
' searching for in within_text. Default=0.
' Optional boolean case_sensitive makes the regex pattern case sensitive
' if true, insensitive otherwise. Default=true.

Dim objRegex As VBScript_RegExp_55.RegExp
Dim colMatch As VBScript_RegExp_55.MatchCollection
Dim vbsMatch As VBScript_RegExp_55.Match
Dim colSubMatch As VBScript_RegExp_55.SubMatches
Dim sMatchString As String

Set objRegex = New VBScript_RegExp_55.RegExp

' Initialise Regex object
With objRegex
    .Global = False
    ' Default is case sensitive
    If case_sensitive Then
        .IgnoreCase = False
    Else: .IgnoreCase = True
    End If
    .pattern = find_pattern
End With

' Return out of bounds error
If start_num >= Len(within_text) Then
    RXGET = CVErr(xlErrNum)
    Exit Function
End If
sMatchString = Right$(within_text, Len(within_text) - start_num)

' Create Match collection
Set colMatch = objRegex.Execute(sMatchString)
If colMatch.Count = 0 Then ' No match
    RXGET = CVErr(xlErrNA)
Else
    Set vbsMatch = colMatch(0)
    If submatch = 0 Then ' Return match value
        RXGET = vbsMatch.Value
    Else
        Set colSubMatch = vbsMatch.SubMatches ' Use the submatch collection
        If colSubMatch.Count < submatch Then
            RXGET = CVErr(xlErrNum)
        Else
            RXGET = CStr(colSubMatch(submatch - 1))
        End If
    End If
End If
End Function

Public Function RXMATCH(ByVal find_pattern As Variant, _
                    ByVal within_range As Variant, _
                    Optional ByVal case_sensitive As Boolean = True) As Variant
' RXMATCH - Searches for the first cell matching regular expression pattern
' find_pattern in the range argument within_range.
' Optional boolean case_sensitive makes the regular expression case
' sensitive if true, insensitive otherwise. Default=true.
' within_range should be a single row or column range. If the range has
' multiple rows and columns, only the first row is searched.

Dim objRegex As VBScript_RegExp_55.RegExp
Dim lCell As Long   ' Stores index of cell in array within_range

' Initialise Regexp
Set objRegex = New VBScript_RegExp_55.RegExp
With objRegex
    .pattern = find_pattern
    .Global = False
    If Not case_sensitive Then
        .IgnoreCase = False
    Else: .IgnoreCase = True
    End If
End With

' Initialise return value, assume no match
RXMATCH = CVErr(xlErrNA)

' Create array from range, then transpose to create column vector if
' necessary
If Not IsArray(within_range) Then ' range is a single cell
    If objRegex.test(within_range) Then
        RXMATCH = 1
    Else
        Exit Function
    End If
Else
    within_range = within_range.Value
    If UBound(within_range, 2) > 1 Then _
        within_range = WorksheetFunction.Transpose(within_range)
    
    For lCell = LBound(within_range) To UBound(within_range)
        If objRegex.test(within_range(lCell, 1)) Then
            RXMATCH = lCell
            Exit For
        End If
    Next lCell

End If

End Function

Public Function RXSUB(ByRef within_text As Variant, _
                    ByRef old_text As Variant, _
                    ByRef new_text As Variant, _
                    Optional ByVal start_num As Long = 0, _
                    Optional ByVal case_sensitive As Boolean = True, _
                    Optional ByVal is_global As Boolean = False) As Variant
' RXSUB - Function to substitute text matching regex pattern old_text with
' string argument new_text in the string within_text.
' Optional long start_num specifies which character to start replacement from,
' default=0.
' Optional boolean case_sensitive will make the pattern case sensitive if true
' and insensitive otherwise. Default=true.
' Optional boolean is_global will substitute new_text for *all* occurrences
' matching old_text if true, only the first otherwise. Default=false.

Dim objRegex As VBScript_RegExp_55.RegExp
Dim sMatchString As String
Dim sResult As String

Set objRegex = New VBScript_RegExp_55.RegExp

' Initialise Regex object
With objRegex
    .Global = is_global
    If case_sensitive Then
        .IgnoreCase = False
    Else: .IgnoreCase = True
    End If
    .pattern = old_text
End With

' Return out of bounds error
If start_num > Len(within_text) Then
    RXSUB = CVErr(xlErrNum)
    Exit Function
End If

sMatchString = Right$(within_text, Len(within_text) - start_num)
sResult = objRegex.Replace(sMatchString, CStr(new_text))
RXSUB = Left$(within_text, start_num) & sResult

End Function
Public Function RXSCAN(ByVal stringval As String, ByVal n As Integer, _
                        Optional ByVal dlm As String = "") As Variant
' RXSCAN - Returns the nth word from stringval, where
' words are delimited by any non-word character by default,
' or a specific character set by optional string dlm.

Dim objRegex As VBScript_RegExp_55.RegExp
Dim colMatch As VBScript_RegExp_55.MatchCollection
Dim charlist As String
Dim char As String
Dim i As Integer

' Build pattern
If dlm = "" Then ' Just looking for A-Za-z0-9_
  charlist = "\w+"
Else ' Negate character class specified
  charlist = "[^"
  For i = 1 To Len(dlm)
    char = Mid$(dlm, i, 1)
    If InStr("[\^-", char) Then char = "\" & char ' escape
    charlist = charlist & char
  Next i
  charlist = charlist & "]+"
End If

' Initialise Regexp object, get match collection
Set objRegex = New VBScript_RegExp_55.RegExp
With objRegex
  .Pattern = charlist
  .Global = True
  Set colMatch = .Execute(stringval)
End With

If colMatch.Count = 0 Then ' No words detected
    RXSCAN = CVErr(xlErrNA)
Else
  If Abs(n) > colMatch.Count Or n = 0 Then
    RXSCAN = CVErr(xlErrNum)
  Else
    If n > 0 Then ' Read from left to right
      RXSCAN = colMatch(n - 1)
    Else ' Right to left
      RXSCAN = colMatch(colMatch.Count + n)
    End If
  End If
End If

End Function

Public Function RXCOUNTW(ByVal stringval As String, _
                        Optional ByVal dlm As String = "") As Variant
' RXCOUNTW - Returns the count of words from stringval, where
' words are delimited by any non-word character by default,
' or a specific character set by optional string dlm.
Dim objRegex As VBScript_RegExp_55.RegExp
Dim colMatch As VBScript_RegExp_55.MatchCollection
Dim charlist As String
Dim char As String
Dim i As Integer

' Build pattern
If dlm = "" Then ' Just looking for A-Za-z0-9_
  charlist = "\w+"
Else ' Negate character class specified
  charlist = "[^"
  For i = 1 To Len(dlm)
    char = Mid$(dlm, i, 1)
    If InStr("[\^-", char) Then char = "\" & char ' escape
    charlist = charlist & char
  Next i
  charlist = charlist & "]+"
End If

' Initialise Regexp object, get match collection
Set objRegex = New VBScript_RegExp_55.RegExp
With objRegex
  .Pattern = charlist
  .Global = True
  Set colMatch = .Execute(stringval)
End With

If colMatch.Count = 0 Then ' No words detected
  RXCOUNTW = CVErr(xlErrNA)
Else
  RXCOUNTW = colMatch.Count
End If

End Function
Leave a comment

12 Comments

  1. Andrew Butterworth

     /  September 10, 2009

    Great Functions. Exactly what I was about to write… Just not quiet as concise as these.

    Thanks again

    Reply
  2. Robbert

     /  September 12, 2009

    Hi Geoff,

    do you know if it is possible to utilize these regular expression functions in the (advanced/custom) filters in Excel lists (2003) or tables (2007)? Would be great if those filters could be specified as regular expressions instead of with Excels own limited wildcard system.

    I’ve been searching for this on google, but couldn’t find it so far.

    cheers,

    Robbert

    Reply
    • Hi Robbert

      sorry no, not directly in the native Excel filter – at least not as far as I know. However you could use ISRXMATCH in a helper column on the table and filter on that returning TRUE. It would also be possible to write a regex filter in VBA. That was something I wished to include with the add-in, along with a regex version of the Find/Replace dialog – but I haven’t got around to it so far. Keep an eye out for the next version…

      Reply
  3. I’ve written my own regex find and replace dialog (with undo) but I never got around to writing the utility functions you wrote.

    I just used rxget, thanks for packaging it as an addin.

    Example syntax for other readers:
    =RXGET(“(\-)(\d*)(\-)”,$C$2,2,,FALSE)

    which returns the second () expression, digits in between two hyphens.

    Reply
    • Thanks Tim, I’m really pleased you found these useful – and thanks for sharing the example! Along with many other things (including the find and replace dialog), I had planned to post some examples like this. Eventually. :)

      I’d be really interested in seeing your find and replace dialog…

      Reply
  4. Colin

     /  February 26, 2011

    You need to default your optional parameters differently, as IsMissing only works with Variant datatypes.

    Reply
  5. Horatio

     /  March 5, 2011

    Hi, I just need to do some searches in Excel using regular expressions and searching Google I’ve came accross your utility. Can you explain how to use it? I added it to my Excel as an add-in. How I use the functions? Their sytax?

    Reply
    • Hi Horatio – your suggestion is something I’ve threatened to do in the past but never have. I’ll try to add some examples later today. In the meantime, if by searches you mean you want to find a cell within a row or column range matching a regular expression pattern, try =RXMATCH(pattern, range). This will return the position of the first matching cell within the range, in a similar way to the built-in function MATCH.

      Feel free to email me directly (address at About Me) if you get stuck.

      Reply
  1. Regex Functions updated « Number Cruncher
  2. Building an Excel currency converter, part 2 « Number Cruncher

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

Follow

Get every new post delivered to your Inbox.

Join 202 other followers

%d bloggers like this: