From 34eb27f314a7b67a922623adb226f33f5b399ef7 Mon Sep 17 00:00:00 2001 From: Zac Medico Date: Fri, 17 Oct 2014 17:38:59 -0700 Subject: [PATCH] emerge --search: use description index This adds an egencache --update-pkg-desc-index action which generates a plain-text index of package names, versions, and descriptions. The index can then be used to optimize emerge --search / --searchdesc actions. If the package description index is missing from a particular repository, then all metadata for that repository is obtained using the normal portdbapi.aux_get method. Searching of installed packages is optimized to take advantage of vardbapi._aux_cache, which is backed by vardb_metadata.pickle. See the IndexedVardb docstring for some more details. X-Gentoo-Bug: 525718 X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718 --- bin/egencache | 43 ++++++++++- man/egencache.1 | 4 ++ man/portage.5 | 6 ++ pym/_emerge/search.py | 196 ++++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 232 insertions(+), 17 deletions(-) diff --git a/bin/egencache b/bin/egencache index e366058..90d5e68 100755 --- a/bin/egencache +++ b/bin/egencache @@ -57,7 +57,7 @@ from portage.util._async.run_main_scheduler import run_main_scheduler from portage.util._eventloop.global_event_loop import global_event_loop from portage import cpv_getkey from portage.dep import Atom, isjustname -from portage.versions import pkgsplit, vercmp +from portage.versions import pkgsplit, vercmp, _pkg_str try: from xml.etree import ElementTree @@ -91,6 +91,9 @@ def parse_args(args): actions.add_argument("--update-changelogs", action="store_true", help="update the ChangeLog files from SCM logs") + actions.add_argument("--update-pkg-desc-index", + action="store_true", + help="update package description index") actions.add_argument("--update-manifests", action="store_true", help="update manifests") @@ -451,6 +454,35 @@ class GenCache(object): if hasattr(trg_cache, '_prune_empty_dirs'): 
trg_cache._prune_empty_dirs() +class GenPkgDescIndex(object): + def __init__(self, portdb, output_file): + self.returncode = os.EX_OK + self._portdb = portdb + self._output_file = output_file + + def run(self): + + portage.util.ensure_dirs(os.path.dirname(self._output_file)) + f = portage.util.atomic_ofstream(self._output_file, + encoding=_encodings["repo.content"]) + + portdb = self._portdb + for cp in portdb.cp_all(): + pkgs = portdb.cp_list(cp) + if not pkgs: + continue + desc, = portdb.aux_get(pkgs[-1], ["DESCRIPTION"]) + + if len(pkgs) == 1: + output = "%s: %s\n" % (pkgs[0], desc) + else: + output = "%s,%s: %s\n" % (pkgs[0], + ",".join(_pkg_str(cpv).version + for cpv in pkgs[1:]), desc) + f.write(output) + + f.close() + class GenUseLocalDesc(object): def __init__(self, portdb, output=None, preserve_comments=False): @@ -893,7 +925,8 @@ def egencache_main(args): local_config=False, env=env) if not (options.update or options.update_use_local_desc or - options.update_changelogs or options.update_manifests): + options.update_changelogs or options.update_manifests or + options.update_pkg_desc_index): parser.error('No action specified') return 1 @@ -1057,6 +1090,12 @@ def egencache_main(args): else: ret.append(scheduler.returncode) + if options.update_pkg_desc_index: + gen_index = GenPkgDescIndex(portdb, os.path.join( + repo_config.location, "metadata", "pkg_desc_index")) + gen_index.run() + ret.append(gen_index.returncode) + if options.update_use_local_desc: gen_desc = GenUseLocalDesc(portdb, output=options.uld_output, diff --git a/man/egencache.1 b/man/egencache.1 index f71feb3..3a3197f 100644 --- a/man/egencache.1 +++ b/man/egencache.1 @@ -19,6 +19,10 @@ for the details on package atom syntax. .BR "\-\-update\-changelogs" Update the ChangeLog files from SCM logs (supported only in git repos). .TP +.BR "\-\-update\-pkg\-desc\-index" +Update the package description index which is located at +\fImetadata/pkg_desc_index\fR in the repository. 
+.TP .BR "\-\-update\-use\-local\-desc" Update the \fIprofiles/use.local.desc\fR file from metadata.xml. .TP diff --git a/man/portage.5 b/man/portage.5 index e399f0f..26856d1 100644 --- a/man/portage.5 +++ b/man/portage.5 @@ -75,6 +75,7 @@ user\-defined package sets .BR /usr/portage/metadata/ .nf layout.conf +pkg_desc_index .fi .TP .BR /usr/portage/profiles/ @@ -1110,6 +1111,11 @@ cache\-formats = md5-dict pms profile\-formats = portage-2 .fi .RE +.TP +.BR pkg_desc_index +This is an index of packages and descriptions which may be generated +by \fBegencache\fR(1) in order to optimize \fBemerge\fR(1) search +actions. .RE .TP .BR /usr/portage/profiles/ diff --git a/pym/_emerge/search.py b/pym/_emerge/search.py index 4b0fd9f..bf15f11 100644 --- a/pym/_emerge/search.py +++ b/pym/_emerge/search.py @@ -3,13 +3,17 @@ from __future__ import print_function +import io import re import portage -from portage import os +from portage import os, _encodings from portage.dbapi.porttree import _parse_uri_map +from portage.dep import Atom +from portage.exception import InvalidData from portage.localization import localized_size from portage.output import bold, bold as white, darkgreen, green, red from portage.util import writemsg_stdout +from portage.versions import _pkg_str from _emerge.Package import Package @@ -30,7 +34,6 @@ class search(object): The list of available and installed packages is created at object instantiation. 
This makes successive searches faster.""" self.settings = root_config.settings - self.vartree = root_config.trees["vartree"] self.spinner = spinner self.verbose = verbose self.searchdesc = searchdesc @@ -41,9 +44,9 @@ class search(object): self._dbs = [] - portdb = root_config.trees["porttree"].dbapi + portdb = IndexedPortdb(root_config.trees["porttree"].dbapi) bindb = root_config.trees["bintree"].dbapi - vardb = root_config.trees["vartree"].dbapi + vardb = IndexedVardb(root_config.trees["vartree"].dbapi) if not usepkgonly and portdb._have_root_eclass_dir: self._dbs.append(portdb) @@ -53,6 +56,7 @@ class search(object): self._dbs.append(vardb) self._portdb = portdb + self._vardb = vardb def _spinner_update(self): if self.spinner: @@ -97,7 +101,7 @@ class search(object): return {} def _visible(self, db, cpv, metadata): - installed = db is self.vartree.dbapi + installed = db is self._vardb built = installed or db is not self._portdb pkg_type = "ebuild" if installed: @@ -208,6 +212,20 @@ class search(object): masked=1 self.matches["pkg"].append([package,masked]) elif self.searchdesc: # DESCRIPTION searching + # Check for DESCRIPTION match first, so that we can skip + # the expensive visibility check if it doesn't match. 
+ full_package = portage.best( + self._xmatch("match-all", package)) + try: + full_desc = self._aux_get( + full_package, ["DESCRIPTION"])[0] + except KeyError: + portage.writemsg( + "emerge: search: aux_get() failed, skipping\n", + noiselevel=-1) + continue + if not self.searchre.search(full_desc): + continue full_package = self._xmatch("bestmatch-visible", package) if not full_package: #no match found; we don't want to query description @@ -217,14 +235,8 @@ class search(object): continue else: masked=1 - try: - full_desc = self._aux_get( - full_package, ["DESCRIPTION"])[0] - except KeyError: - print("emerge: search: aux_get() failed, skipping") - continue - if self.searchre.search(full_desc): - self.matches["desc"].append([full_package,masked]) + + self.matches["desc"].append((full_package, masked)) self.sdict = self.setconfig.getSets() for setname in self.sdict: @@ -262,7 +274,7 @@ class search(object): bold(self.searchkey) + " ]\n") msg.append("[ Applications found : " + \ bold(str(self.mlen)) + " ]\n\n") - vardb = self.vartree.dbapi + vardb = self._vardb metadata_keys = set(Package.metadata_keys) metadata_keys.update(["DESCRIPTION", "HOMEPAGE", "LICENSE", "SRC_URI"]) metadata_keys = tuple(metadata_keys) @@ -372,7 +384,11 @@ class search(object): # private interface # def getInstallationStatus(self,package): - installed_package = self.vartree.dep_bestmatch(package) + installed_package = self._vardb.match(package) + if installed_package: + installed_package = installed_package[-1] + else: + installed_package = "" result = "" version = self.getVersion(installed_package,search.VERSION_RELEASE) if len(version) > 0: @@ -392,3 +408,153 @@ class search(object): result = "" return result + +class IndexedPortdb(object): + """ + A portdbapi interface that uses a package description index to + improve performance. If the description index is missing for a + particular repository, then all metadata for that repository is + obtained using the normal portdbapi.aux_get method. 
+ """ + def __init__(self, portdb): + self._portdb = portdb + self.cpv_exists = portdb.cpv_exists + self.getFetchMap = portdb.getFetchMap + self.findname = portdb.findname + self._aux_cache_keys = portdb._aux_cache_keys + self._have_root_eclass_dir = portdb._have_root_eclass_dir + self._cpv_sort_ascending = portdb._cpv_sort_ascending + self._desc_cache = None + self._cp_map = None + + def _init_index(self): + cp_map = {} + desc_cache = {} + for repo_path in self._portdb.porttrees: + outside_repo = os.path.join(self._portdb.depcachedir, + repo_path.lstrip(os.sep)) + for parent_dir in (repo_path, outside_repo): + file_path = os.path.join(parent_dir, + "metadata", "pkg_desc_index") + + try: + with io.open(file_path, + encoding=_encodings["repo.content"]) as f: + for line in f: + pkgs, desc = line.split(":", 1) + desc = desc.strip() + pkgs = pkgs.split(",") + if not pkgs[0]: + continue + try: + pkg = _pkg_str(pkgs[0]) + except InvalidData: + continue + cp_list = cp_map.get(pkg.cp) + if cp_list is None: + cp_list = [] + cp_map[pkg.cp] = cp_list + cp_list.append(pkg) + for ver in pkgs[1:]: + try: + cp_list.append( + _pkg_str(pkg.cp + "-" + ver)) + except InvalidData: + pass + for cpv in cp_list: + desc_cache[cpv] = desc + except IOError: + pass + else: + break + else: + # No descriptions index was found, so populate + # cp_map the slow way. 
+ for cp in self._portdb.cp_all(trees=[repo_path]): + cp_list = cp_map.get(cp) + if cp_list is None: + cp_list = [] + cp_map[cp] = cp_list + for cpv in self._portdb.cp_list(cp, mytree=repo_path): + if cpv not in cp_list: + cp_list.append(_pkg_str(cpv)) + + self._desc_cache = desc_cache + self._cp_map = cp_map + + def cp_all(self): + if self._cp_map is None: + self._init_index() + return list(self._cp_map) + + def match(self, atom): + if not isinstance(atom, Atom): + atom = Atom(atom) + cp_list = self._cp_map.get(atom.cp) + if cp_list is None: + return [] + self._portdb._cpv_sort_ascending(cp_list) + return portage.match_from_list(atom, cp_list) + + def aux_get(self, cpv, attrs, myrepo = None): + if len(attrs) == 1 and attrs[0] == "DESCRIPTION": + try: + return [self._desc_cache[cpv]] + except KeyError: + pass + return self._portdb.aux_get(cpv, attrs) + + +class IndexedVardb(object): + """ + A vardbapi interface that sacrifices validation in order to + improve performance. It takes advantage of vardbapi._aux_cache, + which is backed by vardb_metadata.pickle. Since _aux_cache is + not updated for every single merge/unmerge (see + _aux_cache_threshold), the list of packages is obtained directly + from the real vardbapi instance. If a package is missing from + _aux_cache, then its metadata is obtained using the normal + (validated) vardbapi.aux_get method. 
+ """ + def __init__(self, vardb): + self._vardb = vardb + self._aux_cache_keys = vardb._aux_cache_keys + self._cpv_sort_ascending = vardb._cpv_sort_ascending + self._cp_map = {} + self.cpv_exists = vardb.cpv_exists + + def cp_all(self): + if self._cp_map: + return list(self._cp_map) + cp_map = self._cp_map + for cpv in self._vardb.cpv_all(): + cp = portage.cpv_getkey(cpv) + if cp is not None: + cp_list = cp_map.get(cp) + if cp_list is None: + cp_list = [] + cp_map[cp] = cp_list + cp_list.append(_pkg_str(cpv)) + return list(cp_map) + + def match(self, atom): + if not isinstance(atom, Atom): + atom = Atom(atom) + cp_list = self._cp_map.get(atom.cp) + if cp_list is None: + return [] + self._vardb._cpv_sort_ascending(cp_list) + return portage.match_from_list(atom, cp_list) + + def aux_get(self, cpv, attrs, myrepo = None): + pkg_data = self._vardb._aux_cache["packages"].get(cpv) + if not isinstance(pkg_data, tuple) or \ + len(pkg_data) != 2 or \ + not isinstance(pkg_data[1], dict): + pkg_data = None + if pkg_data is None: + # It may be missing from _aux_cache due to + # _aux_cache_threshold. + return self._vardb.aux_get(cpv, attrs) + metadata = pkg_data[1] + return [metadata.get(k, "") for k in attrs] -- 2.0.4