#!perl
# Convert a Perl regex to a VBA implementation somewhat like that regex.
# Copyright (C) Chris White 2018. Licensed MIT.
#
use 5.018;
use strict;
use warnings;
use Data::Dumper;
use Carp;
use Getopt::Long;
use Pod::Usage;

use constant { true => !!1, false => !!0 };

use constant EXIT_OK => 0;          # success
use constant EXIT_PROC_ERR => 1;    # error during processing
use constant EXIT_PARAM_ERR => 2;   # couldn't understand the command line

# Run only when executed as a script, so the subs below can also be loaded
# (e.g. with `require`) without side effects.
exit Main() unless caller;

# Test and save a piece.
#
#   $hrPieces   hashref of piece name => regex text; updated in place
#   $piecename  name of the piece being saved
#   $piecetext  regex source text of the piece
#
# Croaks if $piecetext does not compile as a Perl regex, or if it contains a
# `(?...)` group other than non-capturing `(?:...)` or one starting `(?<`.
sub stash_piece {
    my ($hrPieces, $piecename, $piecetext) = @_;

    # Compile once purely as a syntax check; the compiled form is discarded.
    my $compiled_ok = eval { my $re = qr{$piecetext}; 1 };
    croak "$piecename is not a valid regex: $@" unless $compiled_ok;

    # We don't support all group types
    my @bad_groups = $piecetext =~ m{
        (?<!\\)     # escaped \( are OK
        \(\?        # begin a special group
        [^:<]       # we only support non-capturing (?:) and named (?<>)
    }gx;
    croak "Unsupported groups: ", join(' ', @bad_groups) if @bad_groups;

    $hrPieces->{$piecename} = $piecetext;
    return;
} #stash_piece()

# Return the text of piece $name with every `(?<=othername)` reference
# replaced, recursively, by the text of the referenced piece.
#
#   $hrPieces  hashref of piece name => regex text
#   $name      piece to expand
#   $hrActive  hashref of the piece names currently being expanded
#
# Dies on a reference to an unknown piece, or on a circular reference.
sub _expand_piece {
    my ($hrPieces, $name, $hrActive) = @_;

    die "Unknown piece $name" unless defined $hrPieces->{$name};
    die "Circular reference to piece $name" if $hrActive->{$name};

    # Mark this piece as in progress for the duration of this call only.
    local $hrActive->{$name} = 1;

    my $text = $hrPieces->{$name};
    $text =~ s{\(\?<=([^\)]+)\)}{ _expand_piece($hrPieces, $1, $hrActive) }gex;
    return $text;
} #_expand_piece()

# Assemble the pieces into one regex, starting from piece $mainpiece.
# Every referenced piece is checked, not just the last one seen.
sub _assemble_regex {
    my ($hrPieces, $mainpiece) = @_;
    return _expand_piece($hrPieces, $mainpiece, {});
} #_assemble_regex()

# Find named- or non-capturing groups in $regex and strip the `?:` or
# `?<name>` group-type markers, leaving plain `(` groups.
#
# Returns a three-element list:
#   - the regex text with the group-type markers removed;
#   - a hashref mapping zero-based group index => group name, holding
#     entries for named groups only;
#   - the total number of unescaped `(` groups found.
sub _strip_group_types {
    my ($regex) = @_;
    my %names;
    my $groupidx = 0;

    # NOTE: the lookbehind below fails for `\\(foo)`, which should not be
    # ignored.  Main() rejects that input before calling this sub.
    # TODO see https://stackoverflow.com/q/9613522/2877364
    while ($regex =~ m{
            (?<!\\)             # Ignore escaped parens.
            \(                  # open a group
            (?|
                (\?:)           # It's a non-capturing group
              | (\?<([^>]+)>)   # It's a named capturing group
            )?
        }gx) {
        my $resume_pos = pos $regex;
        my ($type_start, $type_end) = ($-[1], $+[1]);
        my $group_type = $1;    # may be undef
        my $group_name = $2;    # may be undef

        if ($group_type) {
            # Remove the group type, since VBScript can't handle those
            my $type_len = $type_end - $type_start;
            substr($regex, $type_start, $type_len, '');

            # Modifying the string resets pos(); resume just after the
            # (now shorter) match.
            pos($regex) = $resume_pos - $type_len;

            # Stash the submatch index for named groups.
            $names{$groupidx} = $group_name if $group_name;
        }
        ++$groupidx;
    } # for each group

    return ($regex, \%names, $groupidx);
} #_strip_group_types()

# Split $text into pieces of at most $width characters (default 60) and
# escape each piece for use inside a VBA string literal by doubling `"`.
#
# The split is made BEFORE escaping, so an escaped `""` pair is never cut
# in half across two literals.  A piece may therefore be longer than $width
# after escaping.  Returns the list of escaped pieces; empty for empty text.
sub _vba_string_chunks {
    my ($text, $width) = @_;
    $width //= 60;

    my @chunks = $text =~ m{(.{1,$width})}gs;
    s{"}{""}g for @chunks;
    return @chunks;
} #_vba_string_chunks()

# Script entry point: parse the command line, read the piece definitions
# from the input files (or STDIN), and print the VBA source on STDOUT.
# Returns the process exit code.
sub Main {
    # Args
    my %opts = (dim=>true, private=>false, quiet=>false, indent=>true);
    GetOptions(\%opts,
        'usage|?', 'help|h', 'man',     # options we handle here
        "dim!",         # whether to print declarations
        "private",      # if true, use Private instead of Dim.
                        # Ignored if --nodim
        "quiet|q",
        "indent!",      # if true, indent the output.
    ) or pod2usage(-verbose => 0, -exitval => EXIT_PARAM_ERR); # unknown opt

    # Help, if requested
    # NOTE(review): help exits with EXIT_PROC_ERR rather than EXIT_OK;
    # kept as-is -- confirm whether that is intended.
    pod2usage(-verbose => 0, -exitval => EXIT_PROC_ERR) if $opts{usage};
    pod2usage(-verbose => 1, -exitval => EXIT_PROC_ERR) if $opts{help};
    pod2usage(-verbose => 2, -exitval => EXIT_PROC_ERR) if $opts{man};

    my %pieces;
    my $piecename = "";
    my $piecetext = "";
    my $mainpiece = "";

    # Save the piece collected so far, if there is one.  The first piece
    # saved becomes the main piece.
    my $finish_piece = sub {
        return unless length($piecename) && length($piecetext);
        stash_piece \%pieces, $piecename, $piecetext;
        $mainpiece = $piecename unless length $mainpiece;
        return;
    };

    # Main input loop
    while (my $line = <>) {
        chomp $line;
        next if $line =~ /^\s*#/;
        last if $line =~ /^__END__\b/;

        $line =~ s{\(\?#[^\)]*\)}{}g;   # remove comment groups
        $line =~ s{\s+$}{};             # remove trailing whitespace

        # A line starting with whitespace yields an empty first field.
        my @fields = split(/\s+/, $line, 2);
        say STDERR "${.}: ", join(':',@fields,'') unless $opts{quiet};

        my ($name, $text) = @fields;
        my $has_name = defined($name) && length($name);
        my $has_text = defined($text) && length($text);

        if ($has_name && $has_text) {       # a new piece
            $finish_piece->();              # Finish the last piece
            $piecename = $name;             # start the new piece
            $piecetext = $text;
        }
        elsif ($has_text) {     # empty fields[0] => continuation
            $piecetext .= $text;
        }
    } #main input loop

    # Finish the last piece, if any
    $finish_piece->();

    say STDERR "Found pieces: ", Dumper(\%pieces) unless $opts{quiet};

    die "No main piece found"
        unless length($mainpiece) && defined $pieces{$mainpiece};

    # Assemble the pieces into one regex
    my $full_regex = _assemble_regex(\%pieces, $mainpiece);

    my $regex_label = 'Full regex is -';
    say STDERR "$regex_label$full_regex-" unless $opts{quiet};

    # Disallow \\( so I don't have to count backslashes to see if it's
    # even or odd.
    if ($full_regex =~ m{\\\\\(}) {
        # Mark the error location.  The regex is printed here only if it
        # was not already printed above.
        say STDERR "$regex_label$full_regex-" if $opts{quiet};

        # Put the caret under the offending text: skip the label, then
        # the offset of the match within the regex.
        my $spacer = ' ' x (length($regex_label) + $-[0]);
        say STDERR $spacer, '^';
        die 'Unfortunately, I can\'t handle `\\\\(` (you can insert `.{0,0}`' .
            ' as a spacer if necessary)';
    }

    # Find named- or non-capturing groups in the regex, and strip the
    # group-type markers.
    my ($plain_regex, $hrNames, $groupidx) = _strip_group_types($full_regex);
    my %names = %$hrNames;

    # Process the definitions, and print them if desired
    my $I = ($opts{indent} ? ' ' : '');     # indent string
    my $I4 = $I x 4;
    my $I8 = $I x 8;

    # Declaration keyword.  `Private` is never indented.
    my $decl = $opts{private} ? 'Private ' : $I4 . 'Dim ';

    say <<"EOT";
${I4}' The following code is from the output of \`re2vba.pl vim-regex.txt\`.
${I4}' DO NOT MODIFY HERE. If you need to change it, modify vim-regex.txt
${I4}' and re-run re2vba.pl.
EOT

    for my $idx (0 .. $groupidx - 1) {
        next unless exists $names{$idx};

        # Uppercase the name and replace every character other than an
        # ASCII letter or digit with an underscore.
        my $name = uc $names{$idx};
        $name =~ s{[^a-zA-Z0-9]}{_}g;
        $names{$idx} = $name;

        say $decl, "RESM_$name As Long" if $opts{dim};
    }

    say $decl, "RE_PAT As String\n" if $opts{dim};

    # Print the regex, with lines broken.  Double quotes are escaped for
    # VBA per chunk, after splitting.
    say $I4, "RE_PAT = _";
    say $I8, "\"$_\" & _" for _vba_string_chunks($plain_regex, 60);
    say $I8, "\"\"";

    # Print the submatch numbers
    for my $idx (0 .. $groupidx - 1) {
        say $I4, "RESM_$names{$idx} = $idx" if exists $names{$idx};
    }

    say "\n${I4}' End of generated code";

    return EXIT_OK;
} #Main()

1;
__END__

=pod

=head1 NAME

re2vba.pl - Convert a Perl regex to a VBA implementation somewhat like that
regex.

=head1 USAGE

    re2vba.pl [-options] [input files (stdin if none is given)]

=head1 OPTIONS

=over

=item --nodim

If given on the command line, do not print the C<Dim> statements.

=item --private

If given, use C<Private> instead of C<Dim>.

=item -q, --quiet

Do not print the diagnostic messages while running.

=item --noindent

If given on the command line, do not indent the output.

=back

=head1 INPUT FORMAT

    # comment to eol (hash must be first non-whitespace on the line)
    <piece name> <regex text>
    [<ws> <regex text continued>]
    __END__ (end of file)

Comment groups (C<(?#...)>) can be used, but must not span lines.
Whitespace remaining at the end of any line after removing comment groups
is ignored.

The first piece given is the main one.

In each piece of regex text, C<< (?<name>) >> defines a group that will be
captured and given a submatch number. Backreferences are not currently
processed.

C<< (?<=piecename) >> is replaced with the text of piece C<piecename>.
(In a real regex, that would be a positive lookbehind assertion, but VBScript
doesn't support those, so we can repurpose it.) Each piece can be referenced
only once.

=head2 Example input

    # piece3 is the main regex
    piece3 (?<=piece1)text here to make it long(?<=piece2)(?:foo)?(bar)?
        (?<lastone>9+)
    piece2 ((?<upperalpha>[A-Z])(?<p2>something))\\a+
        "[0-9]"
    piece1 [a-z](?<p1>thing) (?#comment!)
    __END__

=head1 OUTPUT FORMAT

The tool outputs diagnostics on STDERR (unless --quiet) and VBA source on
STDOUT.
The VBA source defines C<RE_PAT>, which is a string of the regex pattern.
It also defines C<RESM_*> variables as C<Long>. Those are the submatch
numbers of the various named groups in the input. Each named group is
uppercased, and all non-letter/non-digit characters are replaced with
underscores.

=head2 Example output

    Dim RE_PAT As String
    Dim RESM_P1 As Long
    Dim RESM_UPPERALPHA As Long
    Dim RESM_P2 As Long
    Dim RESM_LASTONE As Long
    RE_PAT = _
        "[a-z](thing)text here to make it long(([A-Z])(something))\\a" & _
        "+""[0-9]""(foo)?(bar)?(9+)" & _
        ""
    RESM_P1 = 0
    RESM_UPPERALPHA = 2
    RESM_P2 = 3
    RESM_LASTONE = 6

=head1 COPYRIGHT

Copyright (C) Chris White 2018. Licensed Artistic 2.0.

=cut

# vi: set ts=4 sts=4 sw=4 et ai ff=unix: #