diff --git a/.gitignore b/.gitignore index 34674a541..f60de9185 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ build/ res/ vendor/ CMakeCache.txt +CMakeSettings.json CMakeFiles CMakeScripts cmake_install.cmake @@ -24,6 +25,7 @@ CMakeLists.txt.user *.err .vscode/c_cpp_properties.json .vscode/ipch +.vs/ tests/output tests/env tests/.pytest_cache diff --git a/cmake/targets/libmzn.cmake b/cmake/targets/libmzn.cmake index abf9d934c..4464ae39b 100644 --- a/cmake/targets/libmzn.cmake +++ b/cmake/targets/libmzn.cmake @@ -18,6 +18,7 @@ add_library(mzn lib/copy.cpp lib/exception.cpp lib/eval_par.cpp + lib/feature_extraction.cpp lib/file_utils.cpp lib/flatten.cpp lib/flatten/flat_exp.cpp @@ -81,6 +82,7 @@ add_library(mzn include/minizinc/copy.hh include/minizinc/eval_par.hh include/minizinc/exception.hh + include/minizinc/feature_extraction.hh include/minizinc/file_utils.hh include/minizinc/flat_exp.hh include/minizinc/flatten.hh diff --git a/docs/en/command_line.rst b/docs/en/command_line.rst index 42759e0b0..e63169beb 100644 --- a/docs/en/command_line.rst +++ b/docs/en/command_line.rst @@ -147,6 +147,13 @@ These options control the general behaviour of the ``minizinc`` tool. Print statistics for compilation. +.. option:: --feature-vector, --feature-vector <(\\d*v)?(\\d*c)?(f)?> + + Extract and print the feature vector of the FlatZinc model. + Allows configuration of the feature-extraction process. + [0-9]+v limits the variables in the constraint graph, [0-9]+c limits the constraints in the constraint graph, and f ignores floats. + By default uses "0v0cf", which will not apply padding / cropping to the constraint graph and will ignore floats. + .. option:: -c, --compile Compile only (do not run solver). 
diff --git a/include/minizinc/feature_extraction.hh b/include/minizinc/feature_extraction.hh new file mode 100644 index 000000000..ab26f114b --- /dev/null +++ b/include/minizinc/feature_extraction.hh @@ -0,0 +1,110 @@ +#pragma once +#include +#include + +namespace MiniZinc { + +static std::regex featureVectorOptionsRegex("(\\d*v)?(\\d*c)?(f)?", std::regex_constants::ECMAScript | + std::regex_constants::icase); + +/// Feature Vector of the FlatModel +struct FlatModelFeatureVector { + /// Number of integer variables + int n_int_vars; // NOLINT(readability-identifier-naming) + /// Number of bool variables + int n_bool_vars; // NOLINT(readability-identifier-naming) + /// Number of set variables + int n_set_vars; // NOLINT(readability-identifier-naming) + /// Number of bool constraints + int n_bool_ct; // NOLINT(readability-identifier-naming) + /// Number of integer constraints + int n_int_ct; // NOLINT(readability-identifier-naming) + /// Number of set constraints + int n_set_ct; // NOLINT(readability-identifier-naming) + + double std_dev_domain_size; + double avg_domain_size; + double median_domain_size; + double avg_domain_overlap; + double avg_decision_vars_in_cts; + int n_disjoint_domain_pairs; + int n_meta_ct; + int n_total_ct; + std::string constraint_graph; + std::map ct_histogram; + std::map ann_histogram; + std::vector domain_widths; // indices in this array match keys in customIdToVarNameMap + + // mainly for debugging purposes + // it's important that the IDs are consecutive [0, 1, ..] , so we can not use idn() here + // example: we want the first variable defined in the file to have id 0, the second one 1 ... 
+ // same goes for constraints, the first constraint gets id 0 + std::map customIdToVarNameMap; + std::map customIdToConstraintNameMap; + + + /// Constructor + FlatModelFeatureVector() + : n_int_vars(0), + n_bool_vars(0), + n_set_vars(0), + n_bool_ct(0), + n_int_ct(0), + n_set_ct(0), + std_dev_domain_size(0), + avg_domain_size(0), + median_domain_size(0), + avg_domain_overlap(0), + avg_decision_vars_in_cts(0), + n_disjoint_domain_pairs(0), + n_meta_ct(0), + n_total_ct(0), + constraint_graph(""), + customIdToVarNameMap(), + customIdToConstraintNameMap(), + ct_histogram(), + ann_histogram(), + domain_widths() + {} + + struct Options { + int vDimensions = -1; // for making uniform dimensions of the constraint graph. -1 will not apply padding / cropping + int cDimensions = -1; + bool ignoreFloats = true; // decide whether to ignore floats in all features + + static Options parse_from_string(const std::string input) { + Options opts; + + std::smatch match; + std::regex_match(input, match, featureVectorOptionsRegex); + std::string vars_limit = match[1].str(); + std::string constraints_limit = match[2].str(); + + auto vpos = vars_limit.find_last_of('v'); + auto cpos = constraints_limit.find_last_of('c'); + + if (vpos != std::string::npos) { + vars_limit = vars_limit.erase(vpos, std::string::npos); + } + if (cpos != std::string::npos) { + constraints_limit = constraints_limit.erase(cpos, std::string::npos); + } + + opts.vDimensions = atoi(vars_limit.c_str()); + opts.cDimensions = atoi(constraints_limit.c_str()); + opts.ignoreFloats = match[3].matched; + + return opts; + } + + static bool is_valid_options_regex(const std::string input) { + return std::regex_match(input, featureVectorOptionsRegex); + } + }; +}; + +/// Extract the features for flat model in \a m +FlatModelFeatureVector extract_feature_vector(Env& m, FlatModelFeatureVector::Options& o); + +} + diff --git a/include/minizinc/flatten.hh b/include/minizinc/flatten.hh index 8aa1f1e40..b31a12587 100644 --- 
a/include/minizinc/flatten.hh +++ b/include/minizinc/flatten.hh @@ -150,6 +150,14 @@ struct FlatModelStatistics { int n_imp_del; // NOLINT(readability-identifier-naming) /// Number of linear expressions eliminated using path compression int n_lin_del; // NOLINT(readability-identifier-naming) + + double* std_dev_domain_size; + double* avg_domain_size; + double* median_domain_size; + double* avg_domain_overlap; + int* n_disjoint_domain_pairs; + int n_total_ct; + /// Constructor FlatModelStatistics() : n_int_vars(0), @@ -163,7 +171,13 @@ struct FlatModelStatistics { n_reif_ct(0), n_imp_ct(0), n_imp_del(0), - n_lin_del(0) {} + n_lin_del(0), + std_dev_domain_size(nullptr), // nullptrs instead 0, because 0 is within the valid range of values + avg_domain_size(nullptr), + median_domain_size(nullptr), + avg_domain_overlap(nullptr), + n_disjoint_domain_pairs(nullptr), + n_total_ct(0) {} }; /// Compute statistics for flat model in \a m diff --git a/include/minizinc/flattener.hh b/include/minizinc/flattener.hh index a67d4e048..a82f3f23e 100644 --- a/include/minizinc/flattener.hh +++ b/include/minizinc/flattener.hh @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -65,6 +66,10 @@ public: bool getFlagVerbose() const { return _flags.verbose; } void setFlagStatistics(bool f) { _flags.statistics = f; } bool getFlagStatistics() const { return _flags.statistics; } + void setFlagFeatureVector(FlatModelFeatureVector::Options* options) { + _flags.featureVector = options; + } + FlatModelFeatureVector::Options* getFlagFeatureVector() { return _flags.featureVector; } void setFlagEncapsulateJSON(bool f) { _flags.encapsulateJSON = f; } bool getFlagEncapsulateJSON() const { return _flags.encapsulateJSON; } void setRandomSeed(long unsigned int r) { _fopts.randomSeed = r; } @@ -102,6 +107,7 @@ private: bool allowUnboundedVars = false; bool noMIPdomains = false; bool statistics = false; + FlatModelFeatureVector::Options* featureVector = nullptr; bool stdinInput = 
false; bool allowMultiAssign = false; bool gecode = false; diff --git a/include/minizinc/solver.hh b/include/minizinc/solver.hh index fe86377f9..a8e91c54a 100644 --- a/include/minizinc/solver.hh +++ b/include/minizinc/solver.hh @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -22,6 +23,7 @@ #include #include + namespace MiniZinc { class SolverInitialiser { @@ -164,6 +166,8 @@ public: bool flagStatistics = false; bool flagCompilerVerbose = false; bool flagCompilerStatistics = false; + bool flagFeatureVector = false; + FlatModelFeatureVector::Options featureVectorOptions; bool flagEncapsulateJSON = false; bool flagIsSolns2out = false; std::chrono::milliseconds flagOverallTimeLimit = std::chrono::milliseconds(0); diff --git a/include/minizinc/statistics.hh b/include/minizinc/statistics.hh index f10e638f3..ef3dc51a2 100644 --- a/include/minizinc/statistics.hh +++ b/include/minizinc/statistics.hh @@ -12,6 +12,7 @@ #pragma once #include +#include namespace MiniZinc { @@ -22,6 +23,18 @@ private: bool _json; bool _first = true; std::ios _ios; + std::string _jsonType; + std::string _prefix; + std::string _endMarker; + + template + void serializeValue(std::ostream& os, const T& value) { + if constexpr (std::is_arithmetic::value) { + os << value; + } else { + os << "\"" << value << "\""; + } + } template void addInternal(const std::string& stat, const T& value) { @@ -33,12 +46,76 @@ private: } _os << "\"" << Printer::escapeStringLit(stat) << "\": " << value; } else { - _os << "%%%mzn-stat: " << stat << "=" << value << "\n"; + _os << _prefix << stat << "=" << value << "\n"; + } + } + + template + void addArrayInternal(const std::string& stat, const std::vector& value) { + if (_json) { + if (_first) { + _first = false; + } else { + _os << ", "; + } + _os << "\"" << Printer::escapeStringLit(stat) << "\": ["; + for (size_t i = 0; i < value.size(); ++i) { + if (i > 0) { + _os << ", "; + } + serializeValue(_os, value[i]); + } + _os << "]"; + } else { + 
_os << _prefix << stat << "=["; + for (size_t i = 0; i < value.size(); ++i) { + if (i > 0) { + _os << ", "; + } + _os << value[i]; + } + _os << "]\n"; + } + } + + template + void addMapInternal(const std::string& stat, const std::map& value) { + if (_json) { + if (_first) { + _first = false; + } else { + _os << ", "; + } + _os << "\"" << Printer::escapeStringLit(stat) << "\": {"; + bool firstElem = true; + for (const auto& pair : value) { + if (!firstElem) { + _os << ", "; + } + firstElem = false; + _os << "\"" << pair.first << "\": "; + serializeValue(_os, pair.second); + } + _os << "}"; + } else { + _os << _prefix << stat << "={"; + bool firstElem = true; + for (const auto& pair : value) { + if (!firstElem) { + _os << ", "; + } + firstElem = false; + _os << pair.first << ": " << pair.second; + } + _os << "}\n"; } } public: - StatisticsStream(std::ostream& os, bool json = false); + StatisticsStream(std::ostream& os, bool json = false, + std::string jsonType = "statistics", + std::string linePrefix = "%%%mzn-stat: ", + std::string outputEndMarker = "%%%mzn-stat-end"); ~StatisticsStream(); void precision(std::streamsize prec, bool fixed = false); @@ -52,6 +129,16 @@ public: void add(const std::string& stat, double value); void add(const std::string& stat, const std::string& value); void addRaw(const std::string& stat, const std::string& value); + + template + void addArray(const std::string& stat, const std::vector& value) { + addArrayInternal(stat, value); + } + + template + void addMap(const std::string& stat, const std::map& value) { + addMapInternal(stat, value); + } }; class Statistics { diff --git a/include/minizinc/utils.hh b/include/minizinc/utils.hh index a10606441..7277e4088 100644 --- a/include/minizinc/utils.hh +++ b/include/minizinc/utils.hh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include #ifdef MZN_HAS_LLROUND #include @@ -213,6 +215,46 @@ inline void 
vec_string2vec_pchar(const std::vector& vS, } } +// computes the average without running the risk of overflows +template +inline double mean(const std::vector& numbers) { + static_assert(std::is_arithmetic::value, "Template argument must be a numeric type."); + + if (numbers.empty()) { + return 0; + } + + int s = numbers.size(); + double avg = 0.0; + + for (T n : numbers) { + avg += static_cast(n) / s; + } + + return avg; +} + +// computes the standard deviation of a vector +template +inline double stdDev(const std::vector& numbers) { + static_assert(std::is_arithmetic::value, "Template argument must be a numeric type."); + + if (numbers.empty()) { + return 0; + } + + double m = mean(numbers); + + double sumSquaredDiffs = 0.0; + for (T n : numbers) { + double diff = static_cast(n) - m; + sumSquaredDiffs += diff * diff; + } + + double variance = sumSquaredDiffs / numbers.size(); + return std::sqrt(variance); +} + class Env; class OverflowHandler { diff --git a/lib/feature_extraction.cpp b/lib/feature_extraction.cpp new file mode 100644 index 000000000..b66691ae0 --- /dev/null +++ b/lib/feature_extraction.cpp @@ -0,0 +1,415 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace MiniZinc { + +/* + * A Domain in this context consists of at least one pair of int vals. + * The pair encodes a range (min, max). + * If the Domain consists of multiple ranges, they are disjoint. 
+ */ + +class Domain { +public: + class Range { + public: + long long min; + long long max; + + // Constructor + Range(long long minValue, long long maxValue) : min(minValue), max(maxValue) {} + + // Overload less-than operator for sorting + bool operator<(const Range& other) const { return min < other.min; } + }; + +private: + std::vector _ranges; + long long _width = -1; + +public: + const std::vector& getRanges() const { return _ranges; } + + // Constructor + Domain(const std::vector& inputRanges) : _ranges(inputRanges) { + std::sort(_ranges.begin(), _ranges.end()); + size(); + } + + // Merge two domains without altering the originals + static Domain merge(const Domain& d1, const Domain& d2) { + std::vector allRanges = d1._ranges; + allRanges.insert(allRanges.end(), d2._ranges.begin(), d2._ranges.end()); + std::sort(allRanges.begin(), allRanges.end()); + + std::vector mergedRanges; + if (!allRanges.empty()) { + mergedRanges.push_back(allRanges[0]); + } + + for (const auto& range : allRanges) { + Range& lastRange = mergedRanges.back(); + if (range.min <= lastRange.max + 1) { + // Merge overlapping or adjacent ranges + lastRange.max = std::max(lastRange.max, range.max); + } else { + // Add new non-overlapping range + mergedRanges.push_back(range); + } + } + + return Domain(mergedRanges); + } + + // Convert a domain encoded as IntSetVal into a Domain class object + static inline Domain from(IntSetVal& set) { + std::vector ranges; + for (int i = 0; i < set.size(); i++) { + ranges.push_back({set.min(i).toInt(), set.max(i).toInt()}); + } + return Domain(ranges); + } + + void printRanges() const { + for (const auto& range : _ranges) { + std::cout << "Range: [" << range.min << ", " << range.max << "]\n"; + } + } + + // width = Sum (r.max - r.min + 1) for all ranges r in the domain + long long inline width() { + if (_width == -1) { + long long size = 0; + for (const auto& range : _ranges) { + size += (range.max - range.min + 1); + } + _width = size; + } + + return 
_width; + } + + // Number of ranges in the domain + int inline size() const { return _ranges.size(); } +}; + +class BipartiteGraph { +private: + int uSize; // Number of vertices in set U + int vSize; // Number of vertices in set V + bool uAutoResize; + bool vAutoResize; + std::vector> adjacencyMatrix; // Adjacency matrix +public: + // Constructor to initialize the graph with sizes of U and V + BipartiteGraph(int uSize, int vSize) + : uSize(uSize), + vSize(vSize), + uAutoResize(uSize == 0), + vAutoResize(vSize == 0), + adjacencyMatrix(uSize, std::vector(vSize, 0)) {} + + // Method to add an edge between vertex u in U and vertex v in V + // We can not use iterators to determine the num of vardecls and constraints + void addEdge(int u, int v) { + if (u < 0 || v < 0) { + std::cerr << "Invalid vertex index." << std::endl; + return; + } + + // Resize the matrix if necessary & allowed + if (uAutoResize && u >= uSize) { + adjacencyMatrix.resize(u + 1, std::vector(vSize, 0)); + uSize = u + 1; + } + if (vAutoResize && v >= vSize) { + for (auto& row : adjacencyMatrix) { + row.resize(v + 1, 0); + } + vSize = v + 1; + } + + // we will only enter this block if: + // autoResize is on || the index is within predefined bounds + // all other values will be dropped + if (u < uSize && v < vSize) { + adjacencyMatrix[u][v] = 1; + } + } + + std::string formatMatrix() const { + std::ostringstream oss; + for (int i = 0; i < adjacencyMatrix.size(); i++) { + const auto& row = adjacencyMatrix[i]; + for (const auto& cell : row) { + oss << cell; + } + if (i != adjacencyMatrix.size() - 1) { + oss << "|"; + } + } + return oss.str(); + } + + std::vector> getAdjecencyMatrix() const { + return adjacencyMatrix; + } + + int currentVSize() const { return vSize;} +}; + + +static std::vector calculate_domain_widths(std::vector& domains) { + std::vector domain_sizes; + + for (auto& d : domains) { + domain_sizes.push_back(d.width()); + } + + return domain_sizes; +} + +static std::vector 
domain_overlap_avgs(std::vector& domains) { + std::vector domain_overlaps; + + // pair the domains up for comparison + for (int i = 0; i < domains.size(); i++) { + auto& d1 = domains[i]; + for (int j = i + 1; j < domains.size(); j++) { + auto& d2 = domains[j]; + auto merged_domains = Domain::merge(d1, d2); + // the sum(range_overlaps) <==> domain_overlap + std::vector range_overlaps; + // go over the individual ranges of the first domain + for (int ii = 0; ii < d1.size(); ii++) { + auto& p1 = d1.getRanges()[ii]; + // go over the individual ranges of the second domain + for (int jj = 0; jj < d2.size(); jj++) { + auto& p2 = d2.getRanges()[jj]; + auto overlap_start = std::max(p1.min, p2.min); + auto overlap_end = std::min(p1.max, p2.max); + if (overlap_start <= overlap_end) { + auto range_overlap = overlap_end - overlap_start + 1; + range_overlaps.push_back(range_overlap / (double)merged_domains.width()); + } + } + } + // adds a 0 in case the domain did not overlap + double total_domain_overlap = std::accumulate(range_overlaps.begin(), range_overlaps.end(), 0.0); + domain_overlaps.push_back(total_domain_overlap); + } + } + + return domain_overlaps; +} + +static bool is_var_defined_by_call(Call* call, EnvI& envi, Id* var) { + + auto& ann = Expression::ann(call); + std::vector removeAnns; + for (ExpressionSetIter anns = ann.begin(); anns != ann.end(); ++anns) { + if (Call* c = Expression::dynamicCast(*anns)) { + if (c->id() == envi.constants.ann.defines_var) { + for (unsigned int i = 0; i < c->argCount(); i++) { + auto a = c->arg(i); + if (Expression::isa(a)) { + auto id = Expression::cast(a); + if (Expression::equal(id, var)) { + return true; + } + } + } + } + } + } + return false; +} + +static bool is_call_using_var_defined_by_other(Call* call, EnvI& envi, Id* var) { + bool isDefined = Expression::ann(var->decl()).contains(envi.constants.ann.is_defined_var); + return (isDefined && !is_var_defined_by_call(call, envi, var)); +} + +static void 
add_to_constraint_histogram(FlatModelFeatureVector& features, const char* constraintName) { + features.ct_histogram[constraintName]++; +} + +static void add_to_annotation_histogram(FlatModelFeatureVector& features, Expression* annotations) { + for (auto ann : Expression::ann(annotations)) { + if (Expression::isa(ann)) { + const Id* ident = Expression::cast(ann); + if (ident->decl() != nullptr) { + ident = ident->decl()->id(); + } + if (ident->idn() == -1) { + features.ann_histogram[ident->v().c_str()]++; + } + } + } +} + +static double average_decision_variables_in_constraints(BipartiteGraph& constraintGraph) { + double result = 0; + if (constraintGraph.currentVSize() > 0) { + for (auto& row : constraintGraph.getAdjecencyMatrix()) { + result += mean(row); + } + } + return result; +} + +FlatModelFeatureVector extract_feature_vector(Env& m, FlatModelFeatureVector::Options& o) { + Model* flat = m.flat(); + FlatModelFeatureVector features; + std::vector domains; + std::map varIdToCustomIdMap; + int varIdCounter = 0; + int constraintIdCounter = 0; + BipartiteGraph constraintGraph = BipartiteGraph(0, 0); + + if (o.vDimensions > 0 || o.cDimensions > 0) { + constraintGraph = BipartiteGraph(o.vDimensions, o.cDimensions); + } + + for (auto& i : *flat) { + if (!i->removed()) { + if (auto* vdi = i->dynamicCast()) { + Type t = vdi->e()->type(); + // iterate over every var decl that is not an array + if (t.isvar() && t.dim() == 0) { + if (t.isSet()) { + // todo handle other sets or constraint to intSet + features.n_set_vars++; + Expression* domain = vdi->e()->ti()->domain(); + IntSetVal* bounds = eval_intset(m.envi(), domain); + Domain d = Domain::from(*bounds); + domains.push_back(d); + } else if (t.isint()) { + features.n_int_vars++; + Expression* domain = vdi->e()->ti()->domain(); + if (domain != nullptr) { + IntSetVal* bounds = eval_intset(m.envi(), domain); + Domain d = Domain::from(*bounds); + domains.push_back(d); + } else { + //std::cout << "Nullptr Domain in 
FeatureExtraction - is this a bug?" << std::endl; //TODO discuss with maintainers + } + } else if (t.isbool()) { + features.n_bool_vars++; + Domain d = Domain({{0, 1}}); + domains.push_back(d); + } else if (t.isfloat()) { + // currently omitted in model training + } + GCLock lock; + varIdToCustomIdMap[vdi->e()->id()->str().c_str()] = varIdCounter; + features.customIdToVarNameMap[varIdCounter++] = vdi->e()->id()->str().c_str(); + add_to_annotation_histogram(features, vdi->e()); + } + } else if (auto* ci = i->dynamicCast()) { + if (Call* call = Expression::dynamicCast(ci->e())) { + if (call->argCount() > 0) { + Type all_t; + auto constraintId = constraintIdCounter++; + const char* constraintName = call->id().c_str(); + // skip everything float related in this section todo make floats optional for feature vector + if (std::strstr(constraintName, "float") != nullptr) { + continue; + } + features.customIdToConstraintNameMap[constraintId] = constraintName; + int foreignDefinedVarsUsedByCall = 0; + add_to_constraint_histogram(features, constraintName); + for (unsigned int i = 0; i < call->argCount(); i++) { + Type t = Expression::type(call->arg(i)); + if (t.isvar()) { + if (t.st() == Type::ST_SET || + (t.bt() == Type::BT_FLOAT && all_t.st() != Type::ST_SET) || + (t.bt() == Type::BT_INT && all_t.bt() != Type::BT_FLOAT && + all_t.st() != Type::ST_SET) || + (t.bt() == Type::BT_BOOL && all_t.bt() != Type::BT_INT && + all_t.bt() != Type::BT_FLOAT && all_t.st() != Type::ST_SET)) { + all_t = t; + + const auto a = call->arg(i); + // add variable argument to constraint graph + if (Expression::isa(a)) { + GCLock lock; + Id* id = Expression::cast(a); + constraintGraph.addEdge(varIdToCustomIdMap[id->str().c_str()], constraintId); + + if (is_call_using_var_defined_by_other(call, m.envi(), id)) { + foreignDefinedVarsUsedByCall++; + } + } + // add array arguments elements to constraint graphs if they are variables + else if (Expression::isa(a)) { + const auto* di = Expression::cast(a); 
+ for (auto v : di->getVec()) { + if (Expression::isa(v)) { + GCLock lock; + Id* id = Expression::cast(v); + if (id->decl() != nullptr) { + id = id->decl()->id(); + } + constraintGraph.addEdge(varIdToCustomIdMap[id->str().c_str()], + constraintId); + + if (is_call_using_var_defined_by_other(call, m.envi(), id)) { + foreignDefinedVarsUsedByCall++; + } + } + } + } + } + } + } + if (all_t.isvar()) { + if (all_t.st() == Type::ST_SET) { + features.n_set_ct++; + } else if (all_t.bt() == Type::BT_INT) { + features.n_int_ct++; + } else if (all_t.bt() == Type::BT_BOOL) { + features.n_bool_ct++; + } else if (all_t.bt() == Type::BT_FLOAT) { + // currently omitted in model training + } + } + if (foreignDefinedVarsUsedByCall > 1) { + features.n_meta_ct++; + } + } + } + } + } + } + + features.constraint_graph = constraintGraph.formatMatrix(); + auto domain_sizes = calculate_domain_widths(domains); + features.domain_widths = domain_sizes; //copying intentional + + features.avg_decision_vars_in_cts = average_decision_variables_in_constraints(constraintGraph); + + if (!domain_sizes.empty()) { + features.std_dev_domain_size = std::round(stdDev(domain_sizes) * 1000) / 1000.0; + + double d = mean(domain_sizes); + features.avg_domain_size = mean(domain_sizes); + + std::sort(domain_sizes.begin(), domain_sizes.end()); + features.median_domain_size = domain_sizes[domain_sizes.size() / 2]; + + auto overlaps = domain_overlap_avgs(domains); + features.n_disjoint_domain_pairs = std::count(overlaps.begin(), overlaps.end(), 0.0); + + features.avg_domain_overlap = mean(overlaps); + } + features.n_total_ct += features.n_set_ct + features.n_int_ct + features.n_bool_ct; + return features; +} + +} // namespace MiniZinc \ No newline at end of file diff --git a/lib/flatten.cpp b/lib/flatten.cpp index b56adbc9b..c1ad603f7 100644 --- a/lib/flatten.cpp +++ b/lib/flatten.cpp @@ -5671,6 +5671,7 @@ FlatModelStatistics statistics(Env& m) { stats.n_imp_ct = m.envi().counters.impConstraints; stats.n_imp_del = 
m.envi().counters.impDel; stats.n_lin_del = m.envi().counters.linDel; + for (auto& i : *flat) { if (!i->removed()) { if (auto* vdi = i->dynamicCast()) { diff --git a/lib/flattener.cpp b/lib/flattener.cpp index 9d36a4b43..cf251a624 100644 --- a/lib/flattener.cpp +++ b/lib/flattener.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -896,6 +897,7 @@ void Flattener::flatten(const std::string& modelString, const std::string& model if (stats.n_lin_del != 0) { ss.add("eliminatedLinearConstraints", stats.n_lin_del); } + /// Objective / SAT. These messages are used by mzn-test.py. SolveI* solveItem = env->flat()->solveItem(); @@ -912,6 +914,48 @@ void Flattener::flatten(const std::string& modelString, const std::string& model ss.add("flatTime", flatten_time.s()); } + if (_flags.featureVector != nullptr) { + StatisticsStream ss(_os, _flags.encapsulateJSON, "feature_vector", + "%%%mzn-fvec: ", "%%%mzn-fvec-end"); + FlatModelFeatureVector features = extract_feature_vector(*env, *_flags.featureVector); + + if (!_flags.encapsulateJSON) { + _os << "% Generated FlatZinc Feature Vector:\n"; + } + + ss.add("flatBoolVars", features.n_bool_vars); + ss.add("flatIntVars", features.n_int_vars); + ss.add("flatSetVars", features.n_set_vars); + + ss.addMap("idToVarNameMap", features.customIdToVarNameMap); + ss.addMap("idToConstraintNameMap", features.customIdToConstraintNameMap); + + ss.addArray("domainWidths", features.domain_widths); + ss.add("stdDeviationDomain", features.std_dev_domain_size); + ss.add("averageDomainSize", features.avg_domain_size); + ss.add("medianDomainSize", features.median_domain_size); + ss.add("averageDomainOverlap", features.avg_domain_overlap); + ss.add("numberOfDisjointPairs", features.n_disjoint_domain_pairs); + ss.add("metaConstraints", features.n_meta_ct); + ss.add("totalConstraints", features.n_total_ct); + ss.add("avgDecisionVarsInConstraints", features.avg_decision_vars_in_cts); + + ss.add("constraintGraph", features.constraint_graph); 
+ ss.addMap("constraintHistogram", features.ct_histogram); + ss.addMap("annotationHistogram", features.ann_histogram); + + SolveI* solveItem = env->flat()->solveItem(); + if (solveItem->st() != SolveI::SolveType::ST_SAT) { + if (solveItem->st() == SolveI::SolveType::ST_MAX) { + ss.add("method", "maximize"); + } else { + ss.add("method", "minimize"); + } + } else { + ss.add("method", "satisfy"); + } + } + if (_flags.outputPathsStdout) { if (_flags.verbose) { _log << "Printing Paths to stdout ..." << std::endl; diff --git a/lib/solver.cpp b/lib/solver.cpp index 9cee0bb43..116956686 100644 --- a/lib/solver.cpp +++ b/lib/solver.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -35,6 +36,7 @@ #include #include + #ifdef HAS_OSICBC #include #endif @@ -266,6 +268,7 @@ void MznSolver::printHelp(std::ostream& os, const std::string& selectedSolver) { << " --verbose-compilation\n Print progress/log statements for compilation." << std::endl << " -s, --statistics\n Print statistics." << std::endl << " --compiler-statistics\n Print statistics for compilation." << std::endl + << " --feature-vector, --feature-vector <(\\d*v)?(\\d*c)?(f)?>\n Extracts and prints feature vector of the FlatZinc model.\n Allows configuration of the feature-extraction process.\n [0-9]+v limits variables in constraint graph, [0-9]+c limits constraints in constraint graph, f ignores floats.\n By default uses \"0v0cf\" which will not apply padding / cropping to the constraint graph and will ignore floats." << std::endl << " -c, --compile\n Compile only (do not run solver)." << std::endl << " --config-dirs\n Output configuration directories." << std::endl << " --param-file \n Load parameters from the given JSON file." 
<< std::endl @@ -617,6 +620,13 @@ MznSolver::OptionStatus MznSolver::processOptions(std::vector& argv randomSeed = atoi(argv[i].c_str()); } else if (argv[i] == "--compiler-statistics") { flagCompilerStatistics = true; + } else if (argv[i] == "--feature-vector") { + int j = i + 1; + if (j < argc && FlatModelFeatureVector::Options::is_valid_options_regex(argv[j])) { + featureVectorOptions = FlatModelFeatureVector::Options::parse_from_string(argv[j]); + ++i; + } + flagFeatureVector = true; } else if (argv[i] == "--json-stream") { flagEncapsulateJSON = true; s2out.opt.checkerArgs.emplace_back("--json-stream"); @@ -949,6 +959,9 @@ void MznSolver::flatten(const std::string& modelString, const std::string& model if (flagRandomSeed) { _flt.setRandomSeed(randomSeed); } + if (flagFeatureVector) { + _flt.setFlagFeatureVector(&featureVectorOptions); + } #ifndef __EMSCRIPTEN__ // Create timing thread diff --git a/lib/statistics.cpp b/lib/statistics.cpp index 2e8e8ea7e..772623865 100644 --- a/lib/statistics.cpp +++ b/lib/statistics.cpp @@ -15,15 +15,22 @@ #include #include #include +#include #include +#include namespace MiniZinc { -StatisticsStream::StatisticsStream(std::ostream& os, bool json) - : _os(os), _json(json), _ios(nullptr) { +StatisticsStream::StatisticsStream(std::ostream& os, bool json, std::string jsonType, std::string linePrefix, std::string outputEndMarker) + : _os(os), + _json(json), + _jsonType(jsonType) /*currently in use: statistics, feature_vector*/, + _ios(nullptr), + _prefix(linePrefix), + _endMarker(outputEndMarker) { _ios.copyfmt(os); if (_json) { - _os << "{\"type\": \"statistics\", \"statistics\": {"; + _os << "{\"type\": \"" << _jsonType << "\", \"" << _jsonType << "\": {"; } } @@ -31,7 +38,7 @@ StatisticsStream::~StatisticsStream() { if (_json) { _os << "}}\n"; } else { - _os << "%%%mzn-stat-end\n"; + _os << _endMarker << std::endl; } _os.copyfmt(_ios); } @@ -48,18 +55,25 @@ void StatisticsStream::precision(std::streamsize prec, bool fixed) { void 
StatisticsStream::add(const std::string& stat, const Expression& value) { addInternal(stat, value); } + void StatisticsStream::add(const std::string& stat, int value) { addInternal(stat, value); } + void StatisticsStream::add(const std::string& stat, unsigned int value) { addInternal(stat, value); } + void StatisticsStream::add(const std::string& stat, long value) { addInternal(stat, value); } + void StatisticsStream::add(const std::string& stat, unsigned long value) { addInternal(stat, value); } + void StatisticsStream::add(const std::string& stat, long long value) { addInternal(stat, value); } + void StatisticsStream::add(const std::string& stat, unsigned long long value) { addInternal(stat, value); } + void StatisticsStream::add(const std::string& stat, double value) { if (std::isfinite(value)) { addInternal(stat, value); @@ -72,9 +86,11 @@ void StatisticsStream::add(const std::string& stat, double value) { } } } + void StatisticsStream::add(const std::string& stat, const std::string& value) { addInternal(stat, "\"" + Printer::escapeStringLit(value) + "\""); } + void StatisticsStream::addRaw(const std::string& stat, const std::string& value) { addInternal(stat, value); }