--- pym/gentoolkit/helpers.py	2012-10-09 22:36:24.000000000 +0100
+++ pym/gentoolkit/helpers.py	2013-07-10 18:03:43.303888490 +0100
@@ -338,6 +338,50 @@
 
 		return paths
 
+	@staticmethod
+	def _filter_redundant_regexes(rs):
+		"""Drop alternation branches that a shorter branch already covers.
+
+		Given a list of regular expressions to be joined via "|".join(),
+		remove every branch that textually ends with another branch of
+		the list:
+			['a$', 'ba$'] --> ['a$']
+		Exact duplicates collapse to a single entry.
+
+		This is used in _prepare_search_regex to make filename matching
+		more efficient.
+
+		The comparison is a plain string suffix test, not a regex
+		analysis, so it assumes that a pattern which is a textual suffix
+		of another also matches everything the longer one matches.
+
+		@type rs: list
+		@param rs: file path regular expressions each ending with '$';
+			the list itself is not modified
+		@rtype: list
+		@return: new list with possibly fewer branches, longest first
+
+		@raise ValueError: if a regular expression does not end with '$'
+		"""
+
+		# sorted() works on a copy, so the caller's list keeps its order.
+		# Longest first: any branch that could cover another one is no
+		# longer than it and therefore sorts after it.
+		ordered = sorted(rs, key=len, reverse=True)
+
+		for regex in ordered:
+			if not regex.endswith('$'):
+				raise ValueError(
+					"_filter_redundant_regexes needs all regexes to end with '$'")
+
+		# Keep a branch only if no later (shorter or equal) branch is a
+		# suffix of it; endswith() accepts a tuple and is False for ().
+		return [
+			regex for pos, regex in enumerate(ordered)
+			if not regex.endswith(tuple(ordered[pos + 1:]))
+		]
+
+
 	def _prepare_search_regex(self, queries):
 		"""Create a regex out of the queries"""
 
@@ -357,7 +401,9 @@
 			else:
 				query = "/%s$" % re.escape(query)
 			result.append(query)
-		result = "|".join(result)
+
+		# remove same redundancy here, 9% improvement in run time with "equery belongs"
+		result = "|".join(self._filter_redundant_regexes(result))
 		return result
 
 # =========