platform_PrinterPpds: New tool for calculating a core of the PPD database

This patch adds a new tool called ppdTool, written in Go. It calculates
the minimal subset of PPD files that represents the whole PPD database.
This minimal subset is ~4 times smaller than the whole database. The
tool can also be used to download all PPD files from the PPD database.

BUG=b:172219913
TEST=tested on my workstation

Change-Id: I065b6d48bd0846647ec73ffc918170ffac45069b
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/autotest/+/2519256
Tested-by: Piotr Pawliczek <pawliczek@chromium.org>
Auto-Submit: Piotr Pawliczek <pawliczek@chromium.org>
Reviewed-by: Hidehiko Abe <hidehiko@chromium.org>
Commit-Queue: Hidehiko Abe <hidehiko@chromium.org>
diff --git a/client/site_tests/platform_PrinterPpds/ppdTool.go b/client/site_tests/platform_PrinterPpds/ppdTool.go
new file mode 100644
index 0000000..c4f4271
--- /dev/null
+++ b/client/site_tests/platform_PrinterPpds/ppdTool.go
@@ -0,0 +1,683 @@
+// Copyright 2020 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// ppdTool is a command line tool that can:
+//  * download all PPD files from the database kept on the SCS server;
+//  * cluster a given set of PPD files and return a minimal subset of PPDs
+//    that represents the resultant clusters. This is useful for choosing a
+//    subset of PPD files for testing.
+//
+// The tool can be run with the command:
+//    go run ppdTool.go
+// Use the -h parameter to print help and the list of accepted parameters.
+//
+// The tool can also be compiled into a binary with the following command:
+//    go build ppdTool.go
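+//
+// Example invocations:
+//    go run ppdTool.go download -output ppds_all
+//    go run ppdTool.go compare -input ppds_all -output results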
+
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"compress/gzip"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"net/http"
+	"os"
+	"path/filepath"
+	"regexp"
+	"sort"
+	"strings"
+	"sync"
+)
+
+// downloadFile downloads the content from the given url with HTTP GET. It
+// returns a reader for the content. In case of an error the function
+// terminates the program.
+func downloadFile(url string) io.ReadCloser {
+	response, err := http.Get(url)
+	if err != nil {
+		log.Fatalf("Cannot HTTP GET the file %s: %s.\n", url, err)
+	}
+	if response.StatusCode != 200 {
+		response.Body.Close()
+		log.Fatalf("HTTP GET for the file %s returned status code %d.\n", url, response.StatusCode)
+	}
+	return response.Body
+}
+
+// downloadFilenamesFromPPDIndex retrieves a list of all PPD filenames from
+// the index. Returned PPD filenames are sorted and unique. In case of an
+// error the function terminates the program.
+func downloadFilenamesFromPPDIndex() []string {
+	const urlMetadata = "https://printerconfigurations.googleusercontent.com/chromeos_printing/metadata_v3/"
+
+	output := make(map[string]bool)
+	for i := 0; i < 20; i++ {
+		// Calculate a URL of the index file.
+		urlPPDIndex := fmt.Sprintf("%sindex-%02d.json", urlMetadata, i)
+		// Download and parse the index file.
+		respBody := downloadFile(urlPPDIndex)
+		body, err := ioutil.ReadAll(respBody)
+		respBody.Close()
+		if err != nil {
+			log.Fatalf("Cannot read the content of %s: %s.\n", urlPPDIndex, err)
+		}
+		// Parse the json structure and extract PPD filenames.
+		type jsonName struct {
+			Name string `json:"name"`
+		}
+		type jsonMetadata struct {
+			PPDMetadata []jsonName `json:"ppdMetadata"`
+		}
+		type jsonPrinters struct {
+			PPDIndex map[string]jsonMetadata `json:"ppdIndex"`
+		}
+		var data jsonPrinters
+		if err = json.Unmarshal(body, &data); err != nil {
+			log.Fatalf("Cannot parse the content of %s: %s.\n", urlPPDIndex, err)
+		}
+		for _, entry := range data.PPDIndex {
+			for _, element := range entry.PPDMetadata {
+				output[element.Name] = true
+			}
+		}
+	}
+
+	// Sort filenames.
+	results := make([]string, 0, len(output))
+	for filename := range output {
+		results = append(results, filename)
+	}
+	sort.Strings(results)
+
+	return results
+}
+
+// listFilenamesFromDirectory returns a list of filenames from the given
+// directory. In case of an error the function terminates the program.
+func listFilenamesFromDirectory(path string) []string {
+	files, err := ioutil.ReadDir(path)
+	if err != nil {
+		log.Fatalf("Cannot open the directory %s: %s.\n", path, err)
+	}
+	filenames := make([]string, 0, len(files))
+	for _, file := range files {
+		if !file.IsDir() {
+			filenames = append(filenames, file.Name())
+		}
+	}
+	return filenames
+}
+
+// Statement represents a single statement from a PPD file.
+type Statement struct {
+	keyword string
+	option  string
+	value   string
+}
+
+// PPD represents a content of a single PPD file as an array of Statements.
+// The field name holds the filename of the PPD file while the field
+// originalDataSize holds the initial size of the field data.
+type PPD struct {
+	name             string
+	data             []Statement
+	originalDataSize int
+}
+
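+// The regular expressions below match single lines of a PPD file, e.g.:
+//    reComment:            *% This is a comment
+//    reKeywordOptionValue: *PageSize Letter/US Letter: "<</PageSize[612 792]>>setpagedevice"
+//    reKeywordOnly:        *End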
+var reComment = regexp.MustCompile(`^\*[ \t]*%`)
+var reKeywordOptionValue = regexp.MustCompile(`^\*[ \t]*([^: \t]+)([ \t]+[^:]+)?[ \t]*:[ \t]*([^ \t].*)?$`)
+var reKeywordOnly = regexp.MustCompile(`^\*[ \t]*([^: \t]+)[ \t]*$`)
+var reEmptyLine = regexp.MustCompile(`^[ \t]*$`)
+
+// parseLine parses a single line from a PPD file. The line is supposed to be
+// the first line of a statement's definition. If the line contains only
+// whitespace or is a comment, the function returns an empty Statement
+// (st.keyword == "") and finishes with success (ok == true).
+func parseLine(line string) (st Statement, ok bool) {
+	if reComment.MatchString(line) {
+		return st, true
+	}
+	if m := reKeywordOptionValue.FindStringSubmatch(line); m != nil {
+		st.keyword = m[1]
+		st.option = m[2]
+		st.value = m[3]
+		return st, true
+	}
+	if m := reKeywordOnly.FindStringSubmatch(line); m != nil {
+		st.keyword = m[1]
+		return st, true
+	}
+	if reEmptyLine.MatchString(line) {
+		return st, true
+	}
+	return st, false
+}
+
+// ParsePPD parses the content of a PPD file. The parameter name is the
+// filename of the PPD file (the source of the content).
+func ParsePPD(name string, content []byte) (PPD, error) {
+	ppd := PPD{name: name, data: make([]Statement, 0, 512)}
+	scanner := bufio.NewScanner(bytes.NewReader(content))
+	var multilineValue = false
+	for lineNo := 1; scanner.Scan(); lineNo++ {
+		line := scanner.Text()
+		if multilineValue {
+			// We are inside a multiline value.
+			ppd.data[len(ppd.data)-1].value += "\n" + line
+			// Check for closing ".
+			multilineValue = (strings.Count(line, "\"")%2 == 0)
+			continue
+		}
+
+		st, ok := parseLine(line)
+		if !ok {
+			return ppd, fmt.Errorf("Cannot parse line %d: %s", lineNo, line)
+		}
+		if st.keyword == "" {
+			// A comment or an empty line.
+			continue
+		}
+		ppd.data = append(ppd.data, st)
+		// Check for unmatched " in the value.
+		multilineValue = (strings.Count(st.value, "\"")%2 != 0)
+	}
+	ppd.originalDataSize = len(ppd.data)
+	return ppd, scanner.Err()
+}
+
+var reWhiteSpaces = regexp.MustCompile(`[ \t]+`)
+
+// normalizeSpacesAndTabs normalizes sequences of spaces and tabs in the
+// given string. All leading and trailing spaces and tabs are removed.
+// Every inner sequence of spaces and tabs is replaced by a single space.
+func normalizeSpacesAndTabs(str *string) {
+	*str = strings.TrimSpace(*str)
+	*str = reWhiteSpaces.ReplaceAllString(*str, " ")
+}
+
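+// keywordsToRemove holds keywords of statements that are dropped before
+// comparison; they carry volatile metadata (versions, model and driver
+// names, installed memory) rather than the processing pipeline.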
+var keywordsToRemove = map[string]bool{
+	"1284DeviceID":           true,
+	"cupsLanguages":          true,
+	"cupsVersion":            true,
+	"DefaultDocCutType":      true,
+	"DefaultInstalledMemory": true,
+	"DefaultPageCutType":     true,
+	"DocCutType":             true,
+	"driverUrl":              true,
+	"End":                    true,
+	"FileVersion":            true,
+	"FoomaticIDs":            true,
+	"InstalledMemory":        true,
+	"Manufacturer":           true,
+	"ModelName":              true,
+	"NickName":               true,
+	"PageCutType":            true,
+	"PCFileName":             true,
+	"Product":                true,
+	"ShortNickName":          true,
+	"Throughput":             true}
+
+var shortLang = regexp.MustCompile(`^[a-z][a-z]\.`)
+var longLang = regexp.MustCompile(`^[a-z][a-z]_[A-Za-z][A-Za-z]\.`)
+
+// normalizePPD processes the given PPD content to make it suitable for
+// comparison with other PPDs. The PPD may be no longer valid after this
+// transformation. The following operations are performed on the PPD:
+// * all statements whose keyword is included in the global variable
+//   keywordsToRemove are removed;
+// * all statements whose keyword matches the prefix ^[a-z][a-z]\. or
+//   ^[a-z][a-z]_[A-Za-z][A-Za-z]\. are removed (like *pl.MediaType,
+//   *de.Translation, *fr_CA.Translation, *zh_TW.MediaType, etc.);
+// * sequences of white spaces in all statements are normalized with
+//   the use of normalizeSpacesAndTabs(...).
+func normalizePPD(ppd *PPD) {
+	newData := make([]Statement, 0, len(ppd.data))
+	for _, s := range ppd.data {
+		if keywordsToRemove[s.keyword] {
+			continue
+		}
+		if shortLang.MatchString(s.keyword) || longLang.MatchString(s.keyword) {
+			continue
+		}
+		normalizeSpacesAndTabs(&s.option)
+		normalizeSpacesAndTabs(&s.value)
+		newData = append(newData, s)
+	}
+	ppd.data = newData
+}
+
+// parseAndNormalizePPDFile reads the content of a PPD file from the given
+// reader and parses it. The content is also normalized with the normalizePPD
+// function. In case of an error the function terminates the program.
+func parseAndNormalizePPDFile(reader io.ReadCloser, filename string) PPD {
+	// Decompress the content if needed.
+	if strings.HasSuffix(filename, ".gz") {
+		defer reader.Close()
+		decomp, err := gzip.NewReader(reader)
+		if err != nil {
+			log.Fatalf("Error when decompressing the file %s: %s.\n", filename, err)
+		}
+		reader = decomp
+	}
+	defer reader.Close()
+	content, err := ioutil.ReadAll(reader)
+	if err != nil {
+		log.Fatalf("Error when reading the content of the file %s: %s.\n", filename, err)
+	}
+	ppd, err := ParsePPD(filename, content)
+	if err != nil {
+		log.Fatalf("Error when parsing the content of the file %s: %s.\n", filename, err)
+	}
+	normalizePPD(&ppd)
+	return ppd
+}
+
+// checkNotExists terminates the program when the given path exists.
+func checkNotExists(path string) {
+	_, err := os.Stat(path)
+	if os.IsNotExist(err) {
+		return
+	}
+	if err == nil {
+		log.Fatal("File or directory '" + path + "' already exists.")
+	}
+	log.Fatalf("Cannot access '%s': %s.\n", path, err)
+}
+
+// divideIntoLargeClusters divides the input set of PPDs into clusters of
+// PPDs that define the same processing pipeline (the same values of selected
+// keywords like cupsFilter or FoomaticRIPCommandLine). The output slice
+// contains the resultant clusters saved as lists of PPD names.
+func divideIntoLargeClusters(ppds []PPD) [][]string {
+	type ppdTypeDefinition struct {
+		cupsFilter             string
+		cupsModelNumber        string
+		cupsPreFilter          string
+		driverName             string
+		driverType             string
+		foomaticRIPCommandLine string
+	}
+
+	groups := make(map[ppdTypeDefinition][]int)
+	for iPPD, ppd := range ppds {
+		chosenKeywords := make(map[string][]string)
+		for _, st := range ppd.data {
+			switch st.keyword {
+			case "cupsFilter", "cupsFilter2", "cupsModelNumber", "cupsPreFilter", "FoomaticRIPCommandLine":
+				chosenKeywords[st.keyword] = append(chosenKeywords[st.keyword], st.value)
+			case "driverName", "driverType":
+				chosenKeywords[st.keyword] = append(chosenKeywords[st.keyword], st.option)
+			}
+		}
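+		// In CUPS, cupsFilter2 statements supersede cupsFilter statements
+		// when both are present, so their values replace the cupsFilter
+		// values here.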
+		if values, ok := chosenKeywords["cupsFilter2"]; ok {
+			chosenKeywords["cupsFilter"] = values
+			delete(chosenKeywords, "cupsFilter2")
+		}
+		var hash ppdTypeDefinition
+		for keyword, values := range chosenKeywords {
+			sort.Strings(values)
+			switch keyword {
+			case "cupsFilter":
+				hash.cupsFilter = strings.Join(values, " | ")
+			case "cupsModelNumber":
+				hash.cupsModelNumber = strings.Join(values, " | ")
+			case "cupsPreFilter":
+				hash.cupsPreFilter = strings.Join(values, " | ")
+			case "driverName":
+				hash.driverName = strings.Join(values, " | ")
+			case "driverType":
+				hash.driverType = strings.Join(values, " | ")
+			case "FoomaticRIPCommandLine":
+				hash.foomaticRIPCommandLine = strings.Join(values, " | ")
+			}
+		}
+		groups[hash] = append(groups[hash], iPPD)
+	}
+
+	// Sort every group by originalDataSize(decreasing), name(alphabetically).
+	for _, ppdIDs := range groups {
+		sort.Slice(ppdIDs, func(i, j int) bool {
+			p1 := ppdIDs[i]
+			p2 := ppdIDs[j]
+			if ppds[p1].originalDataSize == ppds[p2].originalDataSize {
+				return ppds[p1].name < ppds[p2].name
+			}
+			return ppds[p1].originalDataSize > ppds[p2].originalDataSize
+		})
+	}
+
+	// Convert groups to a slice of slices with names.
+	groupsSlice := make([][]string, 0, len(groups))
+	for _, group := range groups {
+		names := make([]string, len(group))
+		for i, iPPD := range group {
+			names[i] = ppds[iPPD].name
+		}
+		groupsSlice = append(groupsSlice, names)
+	}
+
+	sort.Slice(groupsSlice, func(i, j int) bool {
+		return groupsSlice[i][0] < groupsSlice[j][0]
+	})
+
+	return groupsSlice
+}
+
+// compareSameSizePPDs is a helper function for divideIntoSmallClusters. It
+// divides the given set of PPDs into clusters of PPDs with identical data.
+// All input PPDs must have data fields of the same length. The function
+// returns the resultant clusters as slices of PPD names.
+func compareSameSizePPDs(ppds []PPD) [][]string {
+	// This slice maps PPD index -> group index. At the beginning, every PPD
+	// is assigned to its own one-element group.
+	ppdsGroups := make([]int, len(ppds))
+	for i := range ppdsGroups {
+		ppdsGroups[i] = i
+	}
+
+	// Find PPDs with the same data and assign them to the same group.
+	for i1, e1 := range ppds {
+		if ppdsGroups[i1] != i1 {
+			// This PPD was already assigned.
+			continue
+		}
+		for i2 := i1 + 1; i2 < len(ppds); i2++ {
+			e2 := ppds[i2]
+			if ppdsGroups[i2] != i2 {
+				// This PPD was already assigned.
+				continue
+			}
+			// Compare data.
+			match := true
+			for ip, s1 := range e1.data {
+				s2 := e2.data[ip]
+				if s1 != s2 {
+					match = false
+					break
+				}
+			}
+			if match {
+				// Assign i2 to the same group as i1.
+				ppdsGroups[i2] = i1
+			}
+		}
+	}
+
+	// This map holds group index -> indexes of the PPDs in the group.
+	groups := make(map[int][]int)
+	for iPPD, iGroup := range ppdsGroups {
+		groups[iGroup] = append(groups[iGroup], iPPD)
+	}
+	// Sort every group by originalDataSize(decreasing), name(alphabetically).
+	for _, ppdIDs := range groups {
+		sort.Slice(ppdIDs, func(i, j int) bool {
+			p1 := ppdIDs[i]
+			p2 := ppdIDs[j]
+			if ppds[p1].originalDataSize == ppds[p2].originalDataSize {
+				return ppds[p1].name < ppds[p2].name
+			}
+			return ppds[p1].originalDataSize > ppds[p2].originalDataSize
+		})
+	}
+
+	// Convert groups to a slice of slices with names.
+	groupsSlice := make([][]string, 0, len(groups))
+	for _, group := range groups {
+		names := make([]string, len(group))
+		for i, iPPD := range group {
+			names[i] = ppds[iPPD].name
+		}
+		groupsSlice = append(groupsSlice, names)
+	}
+
+	return groupsSlice
+}
+
+// divideIntoSmallClusters divides the input set of PPDs into clusters of
+// PPDs with the same content (data). The output slice contains the resultant
+// clusters saved as lists of PPD names.
+func divideIntoSmallClusters(ppds []PPD) [][]string {
+
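+	// ppdHash is a cheap fingerprint of a PPD: PPDs with different
+	// fingerprints cannot have identical data, so full comparisons are
+	// needed only within each bucket.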
+	type ppdHash struct {
+		dataSize        int
+		firstStatement  Statement
+		middleStatement Statement
+		lastStatement   Statement
+	}
+
+	ppdsByHash := make(map[ppdHash][]PPD)
+	for _, ppd := range ppds {
+		var hash ppdHash
+		hash.dataSize = len(ppd.data)
+		hash.firstStatement = ppd.data[0]
+		hash.middleStatement = ppd.data[len(ppd.data)/2]
+		hash.lastStatement = ppd.data[len(ppd.data)-1]
+		ppdsByHash[hash] = append(ppdsByHash[hash], ppd)
+	}
+
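+	// Compare the PPDs within each bucket in a separate goroutine and
+	// merge the resultant clusters received from the channel.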
+	chGroups := make(chan [][]string, len(ppdsByHash))
+	for _, ppdsToCompare := range ppdsByHash {
+		go func(ppdsToCompare []PPD) {
+			chGroups <- compareSameSizePPDs(ppdsToCompare)
+		}(ppdsToCompare)
+	}
+	var groups [][]string
+	for range ppdsByHash {
+		groups = append(groups, <-chGroups...)
+	}
+	close(chGroups)
+
+	sort.Slice(groups, func(i, j int) bool {
+		return groups[i][0] < groups[j][0]
+	})
+
+	return groups
+}
+
+// saveClustersToFile creates a new file at the given path and saves the
+// given list of clusters there, one cluster per line. In case of any error
+// the function terminates the program.
+func saveClustersToFile(clusters [][]string, path string) {
+	file, err := os.Create(path)
+	if err != nil {
+		log.Fatalf("Cannot create a file %s: %s.\n", path, err)
+	}
+	defer file.Close()
+	for _, cluster := range clusters {
+		file.WriteString(strings.Join(cluster, "\t"))
+		file.WriteString("\n")
+	}
+}
+
+// createDirectoryWithPPDs creates the directory given in the parameter
+// pathTrg and hard-links into it the given set of files from the directory
+// defined in pathSrc. In case of any error the function terminates the
+// program.
+func createDirectoryWithPPDs(pathSrc string, filenames []string, pathTrg string) {
+	if err := os.MkdirAll(pathTrg, 0755); err != nil {
+		log.Fatalf("Cannot create a directory '%s': %s.\n", pathTrg, err)
+	}
+	for _, filename := range filenames {
+		src := filepath.Join(pathSrc, filename)
+		trg := filepath.Join(pathTrg, filename)
+		if err := os.Link(src, trg); err != nil {
+			log.Fatalf("Cannot create a hard link %s for the file %s: %s.\n", trg, src, err)
+		}
+	}
+}
+
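+// commandCompare implements the "compare" command: it parses and normalizes
+// all PPD files from the input directory, divides them into large and small
+// clusters, and saves to the output directory the resultant clusters along
+// with directories of representative PPDs.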
+func commandCompare(args []string) {
+	const filenameLargeClusters = "large_clusters.txt"
+	const filenameSmallClusters = "small_clusters.txt"
+	const dirnameCorePPDs = "core_ppds"
+	const dirnameExtPPDs = "ext_ppds"
+
+	flags := flag.NewFlagSet("compare", flag.ExitOnError)
+	flagInput := flags.String("input", "ppds_all", "Directory with PPD files.")
+	flagOutput := flags.String("output", ".", "Directory to save results. It is created if it does not exist.")
+	flags.Parse(args)
+
+	if len(flags.Args()) > 0 {
+		log.Fatal("Unknown parameter. Run with -h or --help to see the list of supported parameters.")
+	}
+
+	pathLargeClusters := filepath.Join(*flagOutput, filenameLargeClusters)
+	pathSmallClusters := filepath.Join(*flagOutput, filenameSmallClusters)
+	pathCorePPDs := filepath.Join(*flagOutput, dirnameCorePPDs)
+	pathExtPPDs := filepath.Join(*flagOutput, dirnameExtPPDs)
+
+	checkNotExists(pathLargeClusters)
+	checkNotExists(pathSmallClusters)
+	checkNotExists(pathCorePPDs)
+	checkNotExists(pathExtPPDs)
+
+	fmt.Println("Reading a list of PPD files from the directory...")
+	filenames := listFilenamesFromDirectory(*flagInput)
+	fmt.Printf("Found %d files.\n", len(filenames))
+
+	fmt.Println("Processing all files...")
+	ppds := make([]PPD, len(filenames))
+	var wg sync.WaitGroup
+	for i, filename := range filenames {
+		wg.Add(1)
+		go func(i int, filename string) {
+			defer wg.Done()
+			path := filepath.Join(*flagInput, filename)
+			reader, err := os.Open(path)
+			if err != nil {
+				log.Fatalf("Cannot open the file %s: %s.\n", path, err)
+			}
+			ppds[i] = parseAndNormalizePPDFile(reader, filename)
+		}(i, filename)
+	}
+	wg.Wait()
+	fmt.Println("Done.")
+
+	fmt.Println("Calculating small clusters...")
+	groupsSmall := divideIntoSmallClusters(ppds)
+	fmt.Printf("Done. The number of small clusters: %d.\n", len(groupsSmall))
+
+	fmt.Println("Calculating large clusters...")
+	groupsLarge := divideIntoLargeClusters(ppds)
+	fmt.Printf("Done. The number of large clusters: %d.\n", len(groupsLarge))
+
+	filenamesCore := make([]string, 0, len(groupsLarge))
+	setFilenameCore := make(map[string]bool)
+	for _, group := range groupsLarge {
+		filenamesCore = append(filenamesCore, group[0])
+		setFilenameCore[group[0]] = true
+	}
+	filenamesExt := make([]string, 0, len(groupsSmall))
+	for _, group := range groupsSmall {
+		if !setFilenameCore[group[0]] {
+			filenamesExt = append(filenamesExt, group[0])
+		}
+	}
+
+	// Save results.
+	createDirectoryWithPPDs(*flagInput, filenamesCore, pathCorePPDs)
+	createDirectoryWithPPDs(*flagInput, filenamesExt, pathExtPPDs)
+	saveClustersToFile(groupsSmall, pathSmallClusters)
+	saveClustersToFile(groupsLarge, pathLargeClusters)
+}
+
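+// commandDownload implements the "download" command: it retrieves the list
+// of PPD filenames from the index and downloads all the files in parallel
+// to the output directory.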
+func commandDownload(args []string) {
+	const urlPPD = "https://printerconfigurations.googleusercontent.com/chromeos_printing/ppds_for_metadata_v3/"
+	const maxNumberOfParallelDownloads = 4
+
+	flags := flag.NewFlagSet("download", flag.ExitOnError)
+	flagOutput := flags.String("output", "ppds_all", "Directory to save PPD files; it must not already exist.")
+	flags.Parse(args)
+
+	if len(flags.Args()) > 0 {
+		log.Fatal("Unknown parameter. Run with -h or --help to see the list of supported parameters.")
+	}
+	checkNotExists(*flagOutput)
+	if err := os.MkdirAll(*flagOutput, 0755); err != nil {
+		log.Fatalf("Cannot create a directory '%s': %s.\n", *flagOutput, err)
+	}
+
+	fmt.Println("Downloading a list of PPD files from the index...")
+	filenames := downloadFilenamesFromPPDIndex()
+	fmt.Printf("Found %d files.\n", len(filenames))
+
+	fmt.Println("Downloading PPD files...")
+	chFilenames := make(chan string)
+	var wgEnd sync.WaitGroup
+	for i := 0; i < maxNumberOfParallelDownloads; i++ {
+		wgEnd.Add(1)
+		go func() {
+			defer wgEnd.Done()
+			for filename := range chFilenames {
+				reader := downloadFile(urlPPD + filename)
+				path := filepath.Join(*flagOutput, filename)
+				file, err := os.Create(path)
+				if err != nil {
+					log.Fatalf("Cannot create file %s on the disk: %s.\n", path, err)
+				}
+				if _, err = io.Copy(file, reader); err != nil {
+					log.Fatalf("Cannot copy the content of the file %s: %s.\n", path, err)
+				}
+				reader.Close()
+				file.Close()
+			}
+		}()
+	}
+	for _, filename := range filenames {
+		chFilenames <- filename
+	}
+	close(chFilenames)
+	wgEnd.Wait()
+	fmt.Println("Done")
+}
+
+const usageText = `
+The first parameter must be one of the following commands:
+
+  download - downloads all PPDs from the index to the given directory.
+
+  compare - performs two independent clusterings of the given set of PPD
+      files. Two sets of clusters are calculated:
+       * a set of large clusters where PPDs are grouped together by pipeline
+           type;
+       * a set of small clusters where PPDs are grouped together by their
+           similarity.
+      For both results a minimal subset of representative PPDs is calculated.
+      In the output directory, the following files and directories are created:
+       * large_clusters.txt - a file with PPD names grouped in large clusters
+       * small_clusters.txt - a file with PPD names grouped in small clusters
+       * core_ppds - a directory with hard links to PPD files representing
+           large clusters; each cluster is represented by exactly one PPD
+           file. For the full PPD dataset given on the input, this directory
+           is expected to contain ~100 PPD files;
+       * ext_ppds - a directory with hard links to PPD files representing
+           small clusters; each cluster is represented by exactly one PPD
+           file. If a PPD file is already present in core_ppds, it is
+           omitted. For the full PPD dataset given on the input, this
+           directory is expected to contain ~1500 PPD files minus the ~100
+           PPD files already present in the core_ppds directory.
+
+Run one of the commands with '-h' or '--help' to get a list of parameters.
+`
+
+func main() {
+	if len(os.Args) < 2 {
+		fmt.Println(usageText)
+		return
+	}
+
+	switch os.Args[1] {
+	case "compare":
+		commandCompare(os.Args[2:])
+	case "download":
+		commandDownload(os.Args[2:])
+	default:
+		fmt.Println(usageText)
+	}
+}