|
| 1 | +//! A `Source` for registry-based packages. |
| 2 | +//! |
| 3 | +//! # What's a Registry? |
| 4 | +//! |
| 5 | +//! Registries are central locations where packages can be uploaded to, |
| 6 | +//! discovered, and searched for. The purpose of a registry is to have a |
| 7 | +//! location that serves as permanent storage for versions of a crate over time. |
| 8 | +//! |
| 9 | +//! Compared to git sources, a registry provides many packages as well as many |
| 10 | +//! versions simultaneously. Git sources can also have commits deleted through |
| 11 | +//! rebasing, whereas registries cannot have their versions deleted. |
| 12 | +//! |
| 13 | +//! # The Index of a Registry |
| 14 | +//! |
| 15 | +//! One of the major difficulties with a registry is that hosting so many |
| 16 | +//! packages may quickly run into performance problems when dealing with |
| 17 | +//! dependency graphs. It's infeasible for cargo to download the entire contents |
| 18 | +//! of the registry just to resolve one package's dependencies, for example. As |
| 19 | +//! a result, cargo needs some efficient method of querying what packages are |
| 20 | +//! available on a registry, what versions are available, and what the |
| 21 | +//! dependencies for each version are. |
| 22 | +//! |
| 23 | +//! One method of doing so would be having the registry expose an HTTP endpoint |
| 24 | +//! which can be queried with a list of packages and a response of their |
| 25 | +//! dependencies and versions is returned. This is somewhat inefficient however |
| 26 | +//! as we may have to hit the endpoint many times and we may have already |
| 27 | +//! queried for much of the data locally already (for other packages, for |
| 28 | +//! example). This also involves inventing a transport format between the |
| 29 | +//! registry and Cargo itself, so this route was not taken. |
| 30 | +//! |
| 31 | +//! Instead, Cargo communicates with registries through a git repository |
| 32 | +//! referred to as the Index. The Index of a registry is essentially an easily |
| 33 | +//! query-able version of the registry's database for a list of versions of a |
| 34 | +//! package as well as a list of dependencies for each version. |
| 35 | +//! |
| 36 | +//! Using git to host this index provides a number of benefits: |
| 37 | +//! |
| 38 | +//! * The entire index can be stored efficiently locally on disk. This means |
| 39 | +//! that all queries of a registry can happen locally and don't need to touch |
| 40 | +//! the network. |
| 41 | +//! |
| 42 | +//! * Updates of the index are quite efficient. Using git buys incremental |
| 43 | +//! updates, compressed transmission, etc for free. The index must be updated |
| 44 | +//! each time we need fresh information from a registry, but this is one |
| 45 | +//! update of a git repository that probably hasn't changed a whole lot so |
| 46 | +//! it shouldn't be too expensive. |
| 47 | +//! |
| 48 | +//! Additionally, each modification to the index is just appending a line at |
| 49 | +//! the end of a file (the exact format is described later). This means that |
| 50 | +//! the commits for an index are quite small and easily applied/compressible. |
| 51 | +//! |
| 52 | +//! ## The format of the Index |
| 53 | +//! |
| 54 | +//! The index is a store for the list of versions for all packages known, so its |
| 55 | +//! format on disk is optimized slightly to ensure that `ls registry` doesn't |
| 56 | +//! produce a list of all packages ever known. The index also wants to ensure |
| 57 | +//! that there's not a million files which may actually end up hitting |
| 58 | +//! filesystem limits at some point. To this end, a few decisions were made |
| 59 | +//! about the format of the registry: |
| 60 | +//! |
| 61 | +//! 1. Each crate will have one file corresponding to it. Each version for a |
| 62 | +//! crate will just be a line in this file. |
| 63 | +//! 2. There will be two tiers of directories for crate names, under which |
| 64 | +//! crates corresponding to those tiers will be located. |
| 65 | +//! |
| 66 | +//! For example, this is a possible hierarchy of an index: |
| 67 | +//! |
| 68 | +//! ```notrust |
| 69 | +//! . |
| 70 | +//! ├── 3 |
| 71 | +//! │ └── u |
| 72 | +//! │ └── url |
| 73 | +//! ├── bz |
| 74 | +//! │ └── ip |
| 75 | +//! │ └── bzip2 |
| 76 | +//! ├── config.json |
| 77 | +//! ├── en |
| 78 | +//! │ └── co |
| 79 | +//! │ └── encoding |
| 80 | +//! └── li |
| 81 | +//! ├── bg |
| 82 | +//! │ └── libgit2 |
| 83 | +//! └── nk |
| 84 | +//! └── link-config |
| 85 | +//! ``` |
| 86 | +//! |
| 87 | +//! The root of the index contains a `config.json` file with a few entries |
| 88 | +//! corresponding to the registry (see `RegistryConfig` below). |
| 89 | +//! |
| 90 | +//! Otherwise, there are three numbered directories (1, 2, 3) for crates with |
| 91 | +//! names 1, 2, and 3 characters in length. The 1/2 directories simply have the |
| 92 | +//! crate files underneath them, while the 3 directory is sharded by the first |
| 93 | +//! letter of the crate name. |
| 94 | +//! |
| 95 | +//! Otherwise the top-level directory contains many two-letter directory names, |
| 96 | +//! each of which has many sub-folders with two letters. At the end of all these |
| 97 | +//! are the actual crate files themselves. |
| 98 | +//! |
| 99 | +//! The purpose of this layout is to hopefully cut down on `ls` sizes as well as |
| 100 | +//! to allow efficient lookup based on the crate name itself. |
| 101 | +//! |
| 102 | +//! ## Crate files |
| 103 | +//! |
| 104 | +//! Each file in the index is the history of one crate over time. Each line in |
| 105 | +//! the file corresponds to one version of a crate, stored in JSON format (see |
| 106 | +//! the `RegistryPackage` structure below). |
| 107 | +//! |
| 108 | +//! As new versions are published, new lines are appended to this file. The only |
| 109 | +//! modifications to this file that should happen over time are yanks of a |
| 110 | +//! particular version. |
| 111 | +//! |
| 112 | +//! # Downloading Packages |
| 113 | +//! |
| 114 | +//! The purpose of the Index was to provide an efficient method to resolve the |
| 115 | +//! dependency graph for a package. So far we only required one network |
| 116 | +//! interaction to update the registry's repository (yay!). After resolution has |
| 117 | +//! been performed, however, we need to download the contents of packages so we |
| 118 | +//! can read the full manifest and build the source code. |
| 119 | +//! |
| 120 | +//! To accomplish this, this source's `download` method will make an HTTP |
| 121 | +//! request per-package requested to download tarballs into a local cache. These |
| 122 | +//! tarballs will then be unpacked into a destination folder. |
| 123 | +//! |
| 124 | +//! Note that because versions uploaded to the registry are frozen forever, |
| 125 | +//! the HTTP download and unpacking can all be skipped if the version has |
| 126 | +//! already been downloaded and unpacked. This caching allows us to only |
| 127 | +//! download a package when absolutely necessary. |
| 128 | +//! |
| 129 | +//! # Filesystem Hierarchy |
| 130 | +//! |
| 131 | +//! Overall, the `$HOME/.cargo` looks like this when talking about the registry: |
| 132 | +//! |
| 133 | +//! ```notrust |
| 134 | +//! # A folder under which all registry metadata is hosted (similar to |
| 135 | +//! # $HOME/.cargo/git) |
| 136 | +//! $HOME/.cargo/registry/ |
| 137 | +//! |
| 138 | +//! # For each registry that cargo knows about (keyed by hostname + hash) |
| 139 | +//! # there is a folder which is the checked out version of the index for |
| 140 | +//! # the registry in this location. Note that this is done so cargo can |
| 141 | +//! # support multiple registries simultaneously |
| 142 | +//! index/ |
| 143 | +//! registry1-<hash>/ |
| 144 | +//! registry2-<hash>/ |
| 145 | +//! ... |
| 146 | +//! |
| 147 | +//! # This folder is a cache for all downloaded tarballs from a registry. |
| 148 | +//! # Once downloaded and verified, a tarball never changes. |
| 149 | +//! cache/ |
| 150 | +//! registry1-<hash>/<pkg>-<version>.tar.gz |
| 151 | +//! ... |
| 152 | +//! |
| 153 | +//! # Location in which all tarballs are unpacked. Each tarball is known to |
| 154 | +//! # be frozen after downloading, so transitively this folder is also |
| 155 | +//! # frozen once it's unpacked (it's never unpacked again) |
| 156 | +//! src/ |
| 157 | +//! registry1-<hash>/<pkg>-<version>/... |
| 158 | +//! ... |
| 159 | +//! ``` |
| 160 | +
|
1 | 161 | use std::io::{mod, fs, File};
|
2 | 162 | use std::io::fs::PathExtensions;
|
3 | 163 | use std::collections::HashMap;
|
@@ -28,11 +188,18 @@ pub struct RegistrySource<'a, 'b:'a> {
|
28 | 188 | handle: Option<http::Handle>,
|
29 | 189 | sources: Vec<PathSource>,
|
30 | 190 | hashes: HashMap<(String, String), String>, // (name, vers) => cksum
|
| 191 | + cache: HashMap<String, Vec<(Summary, bool)>>, |
31 | 192 | }
|
32 | 193 |
|
33 | 194 | #[deriving(Decodable)]
|
34 | 195 | pub struct RegistryConfig {
|
| 196 | + /// Download endpoint for all crates. This will be appended with |
| 197 | + /// `/<crate>/<version>/download` and then will be hit with an HTTP GET |
| 198 | + /// request to download the tarball for a crate. |
35 | 199 | pub dl: String,
|
| 200 | + |
| 201 | + /// API endpoint for the registry. This is what's actually hit to perform |
| 202 | + /// operations like yanks, owner modifications, publishing new crates, etc. |
36 | 203 | pub api: String,
|
37 | 204 | }
|
38 | 205 |
|
@@ -71,6 +238,7 @@ impl<'a, 'b> RegistrySource<'a, 'b> {
|
71 | 238 | handle: None,
|
72 | 239 | sources: Vec::new(),
|
73 | 240 | hashes: HashMap::new(),
|
| 241 | + cache: HashMap::new(), |
74 | 242 | }
|
75 | 243 | }
|
76 | 244 |
|
@@ -185,6 +353,39 @@ impl<'a, 'b> RegistrySource<'a, 'b> {
|
185 | 353 | Ok(dst)
|
186 | 354 | }
|
187 | 355 |
|
| 356 | + /// Load (and memoize in `self.cache`) the parsed index entries for the crate `name`; an unopenable index file yields an empty list |
| 357 | + fn summaries(&mut self, name: &str) -> CargoResult<&Vec<(Summary, bool)>> { |
| 358 | + if self.cache.contains_key_equiv(&name) { |
| 359 | + return Ok(self.cache.find_equiv(&name).unwrap()); |
| 360 | + } |
| 361 | + // shard the lookup path by name length; see "The format of the Index" in the module docs for why |
| 362 | + let path = self.checkout_path.clone(); |
| 363 | + let path = match name.len() { |
| 364 | + 1 => path.join("1").join(name), |
| 365 | + 2 => path.join("2").join(name), |
| 366 | + 3 => path.join("3").join(name.slice_to(1)).join(name), |
| 367 | + _ => path.join(name.slice(0, 2)) |
| 368 | + .join(name.slice(2, 4)) |
| 369 | + .join(name), |
| 370 | + }; |
| 371 | + let summaries = match File::open(&path) { |
| 372 | + Ok(mut f) => { |
| 373 | + let contents = try!(f.read_to_string()); |
| 374 | + let ret: CargoResult<Vec<(Summary, bool)>>; |
| 375 | + ret = contents.as_slice().lines().filter(|l| l.trim().len() > 0) |
| 376 | + .map(|l| self.parse_registry_package(l)) |
| 377 | + .collect(); |
| 378 | + try!(ret.chain_error(|| { |
| 379 | + internal(format!("Failed to parse registry's information \ |
| 380 | + for: {}", name)) |
| 381 | + })) |
| 382 | + } |
| 383 | + Err(..) => Vec::new(), |
| 384 | + }; |
| 385 | + self.cache.insert(name.to_string(), summaries); |
| 386 | + Ok(self.cache.find_equiv(&name).unwrap()) |
| 387 | + } |
| 388 | + |
188 | 389 | /// Parse a line from the registry's index file into a Summary for a
|
189 | 390 | /// package.
|
190 | 391 | ///
|
@@ -223,32 +424,10 @@ impl<'a, 'b> RegistrySource<'a, 'b> {
|
223 | 424 |
|
224 | 425 | impl<'a, 'b> Registry for RegistrySource<'a, 'b> {
|
225 | 426 | fn query(&mut self, dep: &Dependency) -> CargoResult<Vec<Summary>> {
|
226 |
| - let name = dep.get_name(); |
227 |
| - let path = self.checkout_path.clone(); |
228 |
| - let path = match name.len() { |
229 |
| - 1 => path.join("1").join(name), |
230 |
| - 2 => path.join("2").join(name), |
231 |
| - 3 => path.join("3").join(name.slice_to(1)).join(name), |
232 |
| - _ => path.join(name.slice(0, 2)) |
233 |
| - .join(name.slice(2, 4)) |
234 |
| - .join(name), |
235 |
| - }; |
236 |
| - let contents = match File::open(&path) { |
237 |
| - Ok(mut f) => try!(f.read_to_string()), |
238 |
| - Err(..) => return Ok(Vec::new()), |
239 |
| - }; |
240 |
| - |
241 |
| - let ret: CargoResult<Vec<(Summary, bool)>>; |
242 |
| - ret = contents.as_slice().lines().filter(|l| l.trim().len() > 0) |
243 |
| - .map(|l| self.parse_registry_package(l)) |
244 |
| - .collect(); |
245 |
| - let summaries = try!(ret.chain_error(|| { |
246 |
| - internal(format!("Failed to parse registry's information for: {}", |
247 |
| - dep.get_name())) |
248 |
| - })); |
249 |
| - let mut summaries = summaries.into_iter().filter(|&(_, yanked)| { |
| 427 | + let summaries = try!(self.summaries(dep.get_name())); |
| 428 | + let mut summaries = summaries.iter().filter(|&&(_, yanked)| { |
250 | 429 | dep.get_source_id().get_precise().is_some() || !yanked
|
251 |
| - }).map(|(summary, _)| summary).collect::<Vec<_>>(); |
| 430 | + }).map(|&(ref s, _)| s.clone()).collect::<Vec<_>>(); |
252 | 431 | summaries.query(dep)
|
253 | 432 | }
|
254 | 433 | }
|
|
0 commit comments