op_regex.cpp

Go to the documentation of this file.
00001 
00013 #include <cerrno>
00014 
00015 #include <iostream>
00016 #include <fstream>
00017 
00018 #include "string_manip.h"
00019 
00020 #include "op_regex.h"
00021 
00022 using namespace std;
00023 
00024 namespace {
00025 
/**
 * Translate a POSIX regex error code into its human readable message.
 *
 * @param err     error code returned by a regcomp()/regexec() call
 * @param regexp  the regex_t that was handed to the failing call
 * @return the text produced by regerror() for @p err
 */
std::string op_regerror(int err, regex_t const & regexp)
{
    // Called with a null buffer, regerror() only reports the size needed
    // to hold the message, terminating NUL included.
    size_t const needed_size = regerror(err, &regexp, 0, 0);
    if (needed_size == 0)
        return std::string();

    // The storage is owned by a std::string so it is released on return;
    // the former new[] buffer was never deleted and leaked on every call.
    std::string buffer(needed_size, '\0');
    regerror(err, &regexp, &buffer[0], needed_size);

    // c_str() stops at the terminator written by regerror(), so the
    // returned string carries no trailing NUL.
    return std::string(buffer.c_str());
}
00034 
00035 
00036 void op_regcomp(regex_t & regexp, string const & pattern)
00037 {
00038     int err = regcomp(&regexp, pattern.c_str(), REG_EXTENDED);
00039     if (err) {
00040         throw bad_regex("regcomp error: " + op_regerror(err, regexp)
00041                 + " for pattern : " + pattern);
00042     }
00043 }
00044 
00045 
/**
 * Run a compiled regular expression against a string.
 *
 * @param regex   expression previously compiled with regcomp()
 * @param str     the text to search
 * @param match   array of at least @p nmatch entries filled by regexec()
 * @param nmatch  number of entries available in @p match
 * @return true only when regexec() reports a successful match
 */
bool op_regexec(regex_t const & regex, std::string const & str, regmatch_t * match,
           size_t nmatch)
{
    // regexec() returns 0 on a match and a non-zero code otherwise.
    // Testing "!= REG_NOMATCH" also accepted error codes (e.g. REG_ESPACE)
    // as a match, in which case the content of match[] is unspecified.
    return regexec(&regex, str.c_str(), nmatch, match, 0) == 0;
}
00051 
00052 
/**
 * Release a compiled regular expression by handing it to regfree().
 *
 * @param regexp  the expression to release
 */
void op_regfree(regex_t & regexp)
{
    regex_t * const compiled = &regexp;
    regfree(compiled);
}
00057 
00058 
// Return the index number associated with the char x seen in a "\x"
// sequence. The allowed ranges for x are [0-9], giving 0 to 9, and [a-z],
// giving 10 to 35. size_t(-1) is returned when x is outside these ranges.
size_t subexpr_index(char ch)
{
    // Explicit range test instead of isdigit(): handing a negative char
    // to isdigit() is undefined behaviour, and this file does not include
    // <cctype>.
    if (ch >= '0' && ch <= '9')
        return ch - '0';
    if (ch >= 'a' && ch <= 'z')
        return ch - 'a' + 10;
    return size_t(-1);
}
00070 
00071 }  // anonymous namespace
00072 
00073 
00074 bad_regex::bad_regex(string const & pattern)
00075     : op_exception(pattern)
00076 {
00077 }
00078 
00079 
00080 regular_expression_replace::regular_expression_replace(size_t limit_,
00081                                size_t limit_defs)
00082     :
00083     limit(limit_),
00084     limit_defs_expansion(limit_defs)
00085 {
00086 }
00087 
00088 
00089 regular_expression_replace::~regular_expression_replace()
00090 {
00091     for (size_t i = 0 ; i < regex_replace.size() ; ++i)
00092         op_regfree(regex_replace[i].regexp);
00093 }
00094 
00095 
00096 void regular_expression_replace::add_definition(string const & name,
00097                         string const & definition)
00098 {
00099     defs[name] = expand_string(definition);
00100 }
00101 
00102 
00103 void regular_expression_replace::add_pattern(string const & pattern,
00104                          string const & replace)
00105 {
00106     string expanded_pattern = expand_string(pattern);
00107 
00108     regex_t regexp;
00109     op_regcomp(regexp, expanded_pattern);
00110     replace_t regex = { regexp, replace };
00111     regex_replace.push_back(regex);
00112 }
00113 
00114 
00115 string regular_expression_replace::expand_string(string const & input)
00116 {
00117     string last, expanded(input);
00118     size_t i = 0;
00119     for (i = 0 ; i < limit_defs_expansion ; ++i) {
00120         last = expanded;
00121         expanded = substitute_definition(last);
00122         if (expanded == last)
00123             break;
00124     }
00125 
00126     if (i == limit_defs_expansion)
00127         throw bad_regex("too many substitution for: + input");
00128 
00129     return last;
00130 }
00131 
00132 
00133 string regular_expression_replace::substitute_definition(string const & pattern)
00134 {
00135     string result;
00136     bool previous_is_escape = false;
00137 
00138     for (size_t i = 0 ; i < pattern.length() ; ++i) {
00139         if (pattern[i] == '$' && !previous_is_escape) {
00140             size_t pos = pattern.find('{', i);
00141             if (pos != i + 1) {
00142                 throw bad_regex("invalid $ in pattern: " + pattern);
00143             }
00144             size_t end = pattern.find('}', i);
00145             if (end == string::npos) {
00146                 throw bad_regex("no matching '}' in pattern: " + pattern);
00147             }
00148             string def_name = pattern.substr(pos+1, (end-pos) - 1);
00149             if (defs.find(def_name) == defs.end()) {
00150                 throw bad_regex("definition not found and used in pattern: ("
00151                         + def_name + ") " + pattern);
00152             }
00153             result += defs[def_name];
00154             i = end;
00155         } else {
00156             if (pattern[i] == '\\' && !previous_is_escape)
00157                 previous_is_escape = true;
00158             else
00159                 previous_is_escape = false;
00160             result += pattern[i];
00161         }
00162     }
00163 
00164     return result;
00165 }
00166 
00167 
00168 // FIXME limit output string size ? (cause we can have exponential growing
00169 // of output string through a rule "a" = "aa")
00170 bool regular_expression_replace::execute(string & str) const
00171 {
00172     bool changed = true;
00173     for (size_t nr_iter = 0; changed && nr_iter < limit ; ++nr_iter) {
00174         changed = false;
00175         for (size_t i = 0 ; i < regex_replace.size() ; ++i) {
00176             if (do_execute(str, regex_replace[i]))
00177                 changed = true;
00178         }
00179     }
00180 
00181     // this don't return if the input string has been changed but if
00182     // we reach the limit number of iteration.
00183     return changed == false;
00184 }
00185 
00186 
00187 bool regular_expression_replace::do_execute(string & str,
00188                                             replace_t const & regexp) const
00189 {
00190     bool changed = false;
00191 
00192     regmatch_t match[max_match];
00193     for (size_t iter = 0;
00194          op_regexec(regexp.regexp, str, match, max_match) && iter < limit;
00195          iter++) {
00196         changed = true;
00197         do_replace(str, regexp.replace, match);
00198     }
00199 
00200     return changed;
00201 }
00202 
00203 
00204 regmatch_t const &
00205 regular_expression_replace::get_match(regmatch_t const * match, char idx) const
00206 {
00207     size_t sub_expr = subexpr_index(idx);
00208     if (sub_expr == size_t(-1))
00209         throw bad_regex("expect group index: " + idx);
00210     if (sub_expr >= max_match)
00211         throw bad_regex("illegal group index :" + idx);
00212     return match[sub_expr];
00213 }
00214 
00215 void regular_expression_replace::do_replace
00216 (string & str, string const & replace, regmatch_t const * match) const
00217 {
00218     string inserted;
00219     for (size_t i = 0 ; i < replace.length() ; ++i) {
00220         if (replace[i] == '\\') {
00221             if (i == replace.length() - 1) {
00222                 throw bad_regex("illegal \\ trailer: " +
00223                                 replace);
00224             }
00225             ++i;
00226             if (replace[i] == '\\') {
00227                 inserted += '\\';
00228             }  else {
00229                 regmatch_t const & matched = get_match(match,
00230                     replace[i]);
00231                 if (matched.rm_so == -1 && 
00232                     matched.rm_eo == -1) {
00233                     // empty match: nothing todo
00234                 } else if (matched.rm_so == -1 ||
00235                        matched.rm_eo == -1) {
00236                     throw bad_regex("illegal match: " +
00237                         replace);
00238                 } else {
00239                     inserted += str.substr(matched.rm_so,
00240                         matched.rm_eo - matched.rm_so);
00241                 }
00242             }
00243         } else {
00244             inserted += replace[i];
00245         }
00246     }
00247 
00248     size_t first = match[0].rm_so;
00249     size_t count = match[0].rm_eo - match[0].rm_so;
00250 
00251     str.replace(first, count, inserted);
00252 }
00253 
00254 
00255 void setup_regex(regular_expression_replace & regex,
00256                  string const & filename)
00257 {
00258     ifstream in(filename.c_str());
00259     if (!in) {
00260         throw op_runtime_error("Can't open file " + filename +
00261                 " for reading", errno);
00262     }
00263 
00264     regular_expression_replace var_name_rule;
00265     var_name_rule.add_pattern("^\\$([_a-zA-Z][_a-zA-Z0-9]*)[ ]*=.*", "\\1");
00266     regular_expression_replace var_value_rule;
00267     var_value_rule.add_pattern(".*=[ ]*\"(.*)\"", "\\1");
00268 
00269     regular_expression_replace left_rule;
00270     left_rule.add_pattern("[ ]*\"(.*)\"[ ]*=.*", "\\1");
00271     regular_expression_replace right_rule;
00272     right_rule.add_pattern(".*=[ ]*\"(.*)\"", "\\1");
00273 
00274     string line;
00275     while (getline(in, line)) {
00276         line = trim(line);
00277         if (line.empty() || line[0] == '#')
00278             continue;
00279 
00280         string temp = line;
00281         var_name_rule.execute(temp);
00282         if (temp == line) {
00283             string left = line;
00284             left_rule.execute(left);
00285             if (left == line) {
00286                 throw bad_regex("invalid input file: \"" + line + '"');
00287             }
00288 
00289             string right = line;
00290             right_rule.execute(right);
00291             if (right == line) {
00292                 throw bad_regex("invalid input file: \"" + line + '"');
00293             }
00294 
00295             regex.add_pattern(left, right);
00296         } else {
00297             // temp != line ==> var_name_rule succeed to substitute
00298             // into temp the var_name present in line
00299             string var_name = temp;
00300             string var_value = line;
00301             var_value_rule.execute(var_value);
00302             if (var_value == line) {
00303                 throw bad_regex("invalid input file: \"" + line + '"');
00304             }
00305 
00306             regex.add_definition(var_name, var_value);
00307         }
00308     }
00309 }

Generated on 8 Nov 2012 for Oprofile by  doxygen 1.6.1