// Source blob: 4df4438812545f146c43512272cc960b53b10443
// Copyright 2021 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file is mostly copied from chromium repo:
// //components/assist_ranker/example_preprocessing.h
#ifndef ML_EXAMPLE_PREPROCESSOR_EXAMPLE_PREPROCESSING_H_
#define ML_EXAMPLE_PREPROCESSOR_EXAMPLE_PREPROCESSING_H_
#include <string>
#include <google/protobuf/map.h>
#include "chrome/knowledge/assist_ranker/example_preprocessor.pb.h"
#include "chrome/knowledge/assist_ranker/ranker_example.pb.h"
namespace assist_ranker {
// Preprocesses a RankerExample into the formats needed by Ranker Predictors.
class ExamplePreprocessor {
 public:
  // Preprocessing error flags. Every non-zero enumerator occupies its own bit,
  // so several failures can be reported together in a single int.
  enum PreprocessErrorCode {
    kSuccess = 0,
    kNoFeatureIndexFound = 1 << 0,
    kNonbucketizableFeatureType = 1 << 1,
    kInvalidFeatureType = 1 << 2,
    kInvalidFeatureListIndex = 1 << 3,
    kNonNormalizableFeatureType = 1 << 4,
    kNonConvertibleToStringFeatureType = 1 << 5,
    kNormalizerIsZero = 1 << 6,
  };

  // Name of the feature that collects the names of missing features.
  static const char kMissingFeatureDefaultName[];

  // Name of the feature that holds the vectorized features.
  static const char kVectorizedFeatureDefaultName[];

  // Runs the preprocessing described by |config| on |example|.
  // When |clear_other_features| is true, every feature other than
  // kVectorizedFeatureDefaultName is removed from |example|.
  // The return value is a combination of PreprocessErrorCode flags;
  // kSuccess (0) means no error was reported.
  static int Process(const ExamplePreprocessorConfig& config,
                     RankerExample* example,
                     bool clear_other_features = false);

  // Builds the fullname of a feature from |feature_name| and |feature_value|:
  //   * for a scalar (bool / integer / float) feature the fullname is just
  //     |feature_name|;
  //   * for a string_value, or the i-th element of a string_list, the fullname
  //     combines |feature_name| with |feature_value|.
  static std::string FeatureFullname(const std::string& feature_name,
                                     const std::string& feature_value = "");

 private:
  // For each feature listed in config.missing_features() that is absent from
  // |example|, appends the feature name as a sparse value of the special
  // sparse feature "_MissingFeature". Always returns kSuccess.
  static int AddMissingFeatures(const ExamplePreprocessorConfig& config,
                                RankerExample* example);

  // Numeric features that appear in config.bucketizers() are bucketized
  // against the configured boundaries and then reset as one-hot features
  // whose string value is the bucket index.
  static int AddBucketizedFeatures(const ExamplePreprocessorConfig& config,
                                   RankerExample* example);

  // Rescales numeric features into [-1.0, 1.0] and stores them as float
  // features.
  static int NormalizeFeatures(const ExamplePreprocessorConfig& config,
                               RankerExample* example);

  // Turns every feature of |example| that is named in
  // |config.convert_to_string_features()| into a string-valued feature.
  static int ConvertToStringFeatures(const ExamplePreprocessorConfig& config,
                                     RankerExample* example);

  // Adds a float-list feature named kVectorizedFeatureDefaultName. For each
  // existing feature in example.features(), the slot at
  // config.feature_indices(feature_value_key) of that list is set to the
  // numeric value (for scalars) or to 1.0 (for string values).
  static int Vectorization(const ExamplePreprocessorConfig& config,
                           RankerExample* example,
                           bool clear_other_features);
};
// An iterator that goes through all features of a RankerExample and converts
// each field into a struct Field{fullname, value, error}.
// (1) A numeric feature (bool_value, int32_value, float_value) is converted
// to {feature_name, float(original_value), kSuccess}.
// (2) A string feature is converted to
// {feature_name_string_value, 1.0, kSuccess}.
// (3) A string_value from a string list feature is converted to
// {feature_name_string_value, 1.0, error_code} where non-empty list
// gets error_code kSuccess, empty list gets kInvalidFeatureListIndex.
// Example:
// std::vector<float> ExampleToStdFloat(const RankerExample& example,
// const Map& feature_indices) {
// std::vector<float> vectorized(feature_indices.size());
// for (const auto& field : ExampleFloatIterator(example)) {
// if (field.error == ExamplePreprocessor::kSuccess) {
// const int index = feature_indices.at(field.fullname);
// vectorized[index] = field.value;
// }
// }
// return vectorized;
// }
class ExampleFloatIterator {
 public:
  // Float view of a single field taken from a RankerExample.
  struct Field {
    std::string fullname;  // Fullname of the field.
    float value;           // The field's value expressed as a float.
    int error;             // Error code attached to this field.
  };

  // Creates an iterator positioned at the first feature of |example|, and
  // remembers where the features of |example| end.
  explicit ExampleFloatIterator(const RankerExample& example)
      : feature_iterator_(example.features().begin()),
        feature_end_iterator_(example.features().end()) {}

  // Range support: begin() is a copy of this iterator, end() is an iterator
  // parked at the stored end position.
  ExampleFloatIterator begin() const { return *this; }
  ExampleFloatIterator end() const {
    return ExampleFloatIterator(feature_end_iterator_);
  }

  // Declared here, defined out of line.
  Field operator*() const;
  ExampleFloatIterator& operator++();

  // Iterators compare equal when they reference the same feature and, for a
  // string_list feature, the same position inside that list.
  bool operator==(const ExampleFloatIterator& other) const {
    if (!(feature_iterator_ == other.feature_iterator_)) {
      return false;
    }
    return string_list_index_ == other.string_list_index_;
  }
  bool operator!=(const ExampleFloatIterator& other) const {
    return !operator==(other);
  }

 private:
  // Builds the end iterator: both the current and the end positions are set
  // to |feature_end_iterator|, and the string-list index stays at 0.
  explicit ExampleFloatIterator(
      const google::protobuf::Map<std::string, Feature>::const_iterator&
          feature_end_iterator)
      : feature_iterator_(feature_end_iterator),
        feature_end_iterator_(feature_end_iterator) {}

  // Current position within the feature map.
  google::protobuf::Map<std::string, Feature>::const_iterator feature_iterator_;
  // One-past-the-last position of the feature map.
  google::protobuf::Map<std::string, Feature>::const_iterator
      feature_end_iterator_;
  // Index inside the current feature when that feature is a string_list.
  int string_list_index_ = 0;
};
} // namespace assist_ranker
#endif // ML_EXAMPLE_PREPROCESSOR_EXAMPLE_PREPROCESSING_H_