src/autotvm/touch_extractor.cc - tvm - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 /*!
  * \file touch_extractor.cc
  * \brief Extract feature of touch pattern of axes in lowered IR
  */

 #include "touch_extractor.h"

 #include <algorithm>
 #include <cmath>
 #include <set>
 #include <unordered_map>

 namespace tvm {
 namespace autotvm {

 int ParallelLevel(AnnotationType ann) {
   switch (ann) {
     case kBlockX:
     case kBlockY:
     case kBlockZ:
       return 2;
     case kThreadX:
     case kThreadY:
     case kThreadZ:
     case kParallel:
       return 1;
     default:
       return 0;
   }
 }

 // get touch pattern from index expression
 class IndexParser : public ExprVisitor {
  public:
   void Parse(PrimExpr expr) {
     pattern_map.clear();
     this->VisitExpr(expr);
   }

   void VisitExpr_(const VarNode* op) final {
     // TODO(lmzheng): handle more index types (multiple occurrence)
     if (pattern_map.count(op) == 0) {
       pattern_map[op] = TouchPattern();
       pattern_map[op].stride = next_stride_;
       next_stride_ = 1;
     }
   }

   void VisitExpr_(const MulNode* op) final {
     if (op->a.as<VarNode>()) {
       if (const auto stride = op->b.as<IntImmNode>()) {
         next_stride_ = stride->value;
       }
     }
     ExprVisitor::VisitExpr_(op);
   }

   std::unordered_map<const VarNode*, TouchPattern> pattern_map;

  private:
   int64_t next_stride_ = 1;
 };

 // extract iter vars and their touch pattern from ir
 bool TouchExtractor::EnterItervar_(Var var, int64_t length, AnnotationType ann_type) {
   // do not insert duplicated occurrences of virtual thread
   if (ann_type == kVirtualThread && itervar_map.count(var) != 0) {
     skip_stack_size_.push_back(itervar_stack_.size());
     return true;
   } else {
     itervar_stack_.push_back(var);
     topdown_product_ *= length;

     if (itervar_map.count(var) != 0) {
       // find two duplicated axes
       // these happens when we create tvm.thread_axis("threadIdx.x") once and
       // bind it twice. Here we treat them as two axes
       // so we create a snapshot for the old one and freeze it
       Var old = Var(var.get()->name_hint);
       itervar_map.insert({old, itervar_map[var]});
       itervar_map.erase(var);
     }

     itervar_map.insert(
         {var, ItervarFeature(var, length, static_cast<int>(itervar_stack_.size()), ann_type,
                              topdown_product_, static_cast<int>(itervar_counter_++))});
   }

   return true;
 }

 void TouchExtractor::ExitItervar_() {
   if (!skip_stack_size_.empty() && skip_stack_size_.back() == itervar_stack_.size()) {
     skip_stack_size_.pop_back();
     return;
   }
   Var var = itervar_stack_.back();

   // update count and reuse ratio for upper iter vars (includes self)
   for (auto kv : itervar_map[var].touch_feature) {
     if (kv.second.stride != 0) {  // multiply count
       for (auto stack_var : itervar_stack_) {
         auto touch_pattern = itervar_map[stack_var].touch_feature.find(kv.first);
         CHECK(touch_pattern != itervar_map[stack_var].touch_feature.end());
         touch_pattern->second.count *= itervar_map[var].length;
       }
     } else {  // multiply reuse ratio
       for (auto stack_var : itervar_stack_) {
         auto touch_pattern = itervar_map[stack_var].touch_feature.find(kv.first);
         CHECK(touch_pattern != itervar_map[stack_var].touch_feature.end());
         touch_pattern->second.reuse *= itervar_map[var].length;
       }
     }
   }
   itervar_stack_.pop_back();

   int64_t length = itervar_map[var].length;
   if (length != 0) topdown_product_ /= length;
   int64_t bottomup_product = -1;
   for (auto kv : itervar_map[var].touch_feature) {
     bottomup_product = std::max(bottomup_product, kv.second.count * kv.second.reuse);
   }

   itervar_map[var].bottomup_product = bottomup_product;

   // push base to upper parallel axis
   int para_level = ParallelLevel(itervar_map[var].ann);
   // if is the separate line of parallel level, push the base to upper parallel level
   if (!itervar_stack_.empty() &&
       ParallelLevel(itervar_map[itervar_stack_.back()].ann) == para_level + 1) {
     for (auto kv : itervar_map[var].touch_feature) {
       for (auto stack_var : itervar_stack_) {
         if (ParallelLevel(itervar_map[stack_var].ann) == para_level + 1) {
           auto touch_pattern = itervar_map[stack_var].touch_feature.find(kv.first);
           CHECK(touch_pattern != itervar_map[stack_var].touch_feature.end());
           touch_pattern->second.thread_reuse = -kv.second.reuse;
           touch_pattern->second.thread_count = -kv.second.count;
           // NOTE: use minus as a flag to denote it is a base,
           // indicating it is not the final value
         }
       }
     }
   }

   for (auto kv : itervar_map[var].touch_feature) {
     if (kv.second.thread_count < 0) {
       itervar_map[var].touch_feature[kv.first].thread_count =
           kv.second.count / (-kv.second.thread_count);
       itervar_map[var].touch_feature[kv.first].thread_reuse =
           kv.second.reuse / (-kv.second.thread_reuse);
     }
   }
 }

 void TouchExtractor::EnterMem_(Var buffer_var, PrimExpr index) {
   std::string name = buffer_var.get()->name_hint;
   TouchedBuffer buf = name + "_" + std::to_string(buffer_counter_[name]++);

   // extract touch pattern from index
   IndexParser parser;
   parser.Parse(index);

   // push up mem access info
   for (auto var : itervar_stack_) {
     auto x = parser.pattern_map.find(var.get());
     if (x != parser.pattern_map.end()) {
       itervar_map[var].touch_feature[buf] = x->second;
     } else {
       itervar_map[var].touch_feature[buf] = TouchPattern();
     }
   }
 }

 void TouchExtractor::ExitMem_() {}

 /*!
  * \brief Get axis-based feature for all axes
  * \param stmt The statement to be extracted
  * \param bool Whether take log for numerical feature
  * \param ret_feature The buffer where the return value is stored
  *
  * \note The format of return value is
  * ((
  *   ('_itervar_',  var),
  *   ('_attr_',     length, nest_level, topdown, bottomup, one_hot_annotation),
  *   ('_arith_',    add_ct, mul_ct, div_ct),
  *   ('data_vec_0', stride, mod, count, reuse, thread_count, thread_reuse),
  *   ('conv_0',     stride, mod, count, reuse, thread_count, thread_reuse),
  * ),
  * (
  *   ('_itervar_',    var2),
  *   ('_attr_',       length, nest_level, one_hot_annotation),
  *   ('_arith_',      add_ct, mul_ct, div_ct),
  *   ('kernel_vec_0', stride, mod, count, reuse, thread_count, thread_reuse),
  *   ('conv_1',       stride, mod, count, reuse, thread_count, thread_reuse),
  * ))
  *
  * Itervars are sorted according to their first occurrence position in IR.
  * Buffers touched by an itervar are sorted by their unique names.
  *
  * \note If you want to flatten these features as the input of your model,
  * You can use the faster one GetItervarFeatureFlatten below.
  */
 void GetItervarFeature(Stmt stmt, bool take_log, Array<Array<Array<PrimExpr> > >* ret_feature) {
   // extract
   TouchExtractor touch_analyzer;
   touch_analyzer.Analyze(stmt);

   // sort according to order
   std::vector<Var> vars;
   for (auto kv : touch_analyzer.itervar_map) {
     vars.push_back(kv.first);
   }
   std::sort(vars.begin(), vars.end(), [&](const Var& lhs, const Var& rhs) -> bool {
     return touch_analyzer.itervar_map[lhs].order < touch_analyzer.itervar_map[rhs].order;
   });

   // whether take log for numerical feature
   std::function<double(int64_t)> trans;
   if (take_log) {
     trans = [](int64_t x) {
       if (x < 0) return -std::log(-x + 1) / std::log(2);
       x = x + 1;
       return std::log(x) / std::log(2);
     };
   } else {
     trans = [](int64_t x) { return x; };
   }

   // serialize for front end
   for (auto var : vars) {
     Array<Array<PrimExpr> > feature_row;
     ItervarFeature& fea = touch_analyzer.itervar_map[var];
     feature_row.push_back(Array<PrimExpr>{tvm::tir::StringImm("_itervar_"), var});

     Array<PrimExpr> attr{
         tvm::tir::StringImm("_attr_"),
         FloatImm(DataType::Float(32), trans(fea.length)),
         IntImm(DataType::Int(32), fea.nest_level),
         FloatImm(DataType::Float(32), trans(fea.topdown_product)),
         FloatImm(DataType::Float(32), trans(fea.bottomup_product)),
     };
     // one hot annotation
     for (int i = 0; i < kNum; i++) {
       attr.push_back(i == fea.ann);
     }
     feature_row.push_back(attr);

     // arithmetic
     feature_row.push_back(Array<PrimExpr>{
         tvm::tir::StringImm("_arith_"),
         FloatImm(DataType::Float(32), trans(fea.add_ct)),
         FloatImm(DataType::Float(32), trans(fea.mul_ct)),
         FloatImm(DataType::Float(32), trans(fea.div_ct)),
     });

     // touch map
     std::vector<TouchedBuffer> bufs;
     for (auto kv : fea.touch_feature) {
       bufs.push_back(kv.first);
     }
     std::sort(bufs.begin(), bufs.end());
     for (auto k : bufs) {
       TouchPattern& v = fea.touch_feature[k];
       feature_row.push_back(Array<PrimExpr>{
           tvm::tir::StringImm(k),
           FloatImm(DataType::Float(32), trans(v.stride)),
           FloatImm(DataType::Float(32), trans(v.mod)),
           FloatImm(DataType::Float(32), trans(v.count)),
           FloatImm(DataType::Float(32), trans(v.reuse)),
           FloatImm(DataType::Float(32), trans(v.thread_count)),
           FloatImm(DataType::Float(32), trans(v.thread_reuse)),
       });
     }

     ret_feature->push_back(feature_row);
   }
 }

 /*!
  * \brief Get axis-based feature for all axes and flatten them into a one-dimensional vector.
  * \param stmt The statement to be extracted
  * \param bool Whether take log for numerical feature
  * \param ret_feature The buffer where the return value is stored
  *
  * \note See GetItervarFeature for more details about the return value.
  *       This is an optimized version of GetItervarFeature + Flatten. This runs much faster.
  */
 void GetItervarFeatureFlatten(Stmt stmt, bool take_log, std::vector<float>* ret_feature) {
   // extract touch feature
   TouchExtractor touch_analyzer;
   touch_analyzer.Analyze(stmt);

   // sort according to order
   std::vector<Var> vars;
   for (auto kv : touch_analyzer.itervar_map) {
     vars.push_back(kv.first);
   }
   std::sort(vars.begin(), vars.end(), [&](const Var& lhs, const Var& rhs) -> bool {
     return touch_analyzer.itervar_map[lhs].order < touch_analyzer.itervar_map[rhs].order;
   });

   // whether take log for numerical feature
   std::function<float(int64_t)> trans;
   if (take_log) {
     trans = [](int64_t x) {
       if (x < 0) return -std::log(-x + 1) / std::log(2);
       x = x + 1;
       return std::log(x) / std::log(2);
     };
   } else {
     trans = [](int64_t x) { return x; };
   }

   // serialize for front end
   for (auto var : vars) {
     ItervarFeature& fea = touch_analyzer.itervar_map[var];

     ret_feature->push_back(trans(fea.length));
     ret_feature->push_back(fea.nest_level);
     ret_feature->push_back(trans(fea.topdown_product));
     ret_feature->push_back(trans(fea.bottomup_product));

     // one hot annotation
     for (int i = 0; i < kNum; i++) {
       ret_feature->push_back(i == fea.ann);
     }

     // arithmetic
     ret_feature->push_back(trans(fea.add_ct));
     ret_feature->push_back(trans(fea.mul_ct));
     ret_feature->push_back(trans(fea.div_ct));

     // touch map
     std::vector<TouchedBuffer> bufs;
     for (auto kv : fea.touch_feature) {
       bufs.push_back(kv.first);
     }
     std::sort(bufs.begin(), bufs.end());
     for (auto k : bufs) {
       TouchPattern& v = fea.touch_feature[k];
       ret_feature->push_back(trans(v.stride));
       ret_feature->push_back(trans(v.mod));
       ret_feature->push_back(trans(v.count));
       ret_feature->push_back(trans(v.reuse));
       ret_feature->push_back(trans(v.thread_count));
       ret_feature->push_back(trans(v.thread_reuse));
     }
   }
 }

 /*!
  * \brief Get curve sample feature (relation feature) and flatten them into a one-dimensional
  * vector. \param stmt The statement to be extracted \param sample_n The number of points used for
  * sampling a curve (along one dimension) \param ret_feature The buffer where the return value is
  * stored
  */
 void GetCurveSampleFeatureFlatten(Stmt stmt, int sample_n, std::vector<float>* ret_feature) {
   // extract touch feature
   TouchExtractor touch_ext;
   touch_ext.Analyze(stmt);

   // sort according to order
   std::vector<Var> vars;
   for (auto kv : touch_ext.itervar_map) {
     vars.push_back(kv.first);
   }
   std::sort(vars.begin(), vars.end(), [&](const Var& lhs, const Var& rhs) -> bool {
     return touch_ext.itervar_map[lhs].order < touch_ext.itervar_map[rhs].order;
   });

   int max_depth = 0;
   std::map<TouchedBuffer, std::vector<double> > reuse_curve;
   std::map<TouchedBuffer, std::vector<double> > count_curve;
   std::map<TouchedBuffer, std::vector<double> > topdown_curve;
   std::map<TouchedBuffer, std::vector<double> > bottomup_curve;
   std::set<TouchedBuffer> innermost_buffers;
   std::set<std::string> added;

   // find maximum depth of loop nest
   for (auto var : vars) {
     ItervarFeature& fea = touch_ext.itervar_map[var];
     max_depth = std::max(max_depth, fea.nest_level);
   }

   // mark inner most buffer
   for (auto iter = vars.rbegin(); iter != vars.rend(); iter++) {
     auto var = *iter;
     ItervarFeature& fea = touch_ext.itervar_map[var];
     if (fea.nest_level == max_depth) {
       for (auto kv : fea.touch_feature) {
         // delete buffer no (e.g. 'A_0' -> 'A', 'A_1' -> 'A')
         std::string raw_name = kv.first.substr(0, kv.first.rfind("_"));

         // delete memory scope (e.g. 'A.local' -> 'A', 'A.shared' -> 'A')
         size_t pos = raw_name.find(".");
         if (pos < kv.first.size()) raw_name = raw_name.substr(0, pos);

         // If there are multiple innermost buffers that are derived from a same raw buffer
         // We only record the last occurrence (note the `iter` is in reverse order)
         // e.g. `A.local`, `A.shared` are derived from `A`, if they all occurred at the inner most
         // level, we will only record the last occurrence,
         if (added.find(raw_name) == added.end()) {
           innermost_buffers.insert(kv.first);
           added.insert(raw_name);
         }
       }
     }
   }

   // pad the first point (zero) for all curves
   for (auto buf : innermost_buffers) {
     reuse_curve[buf].push_back(0);
     count_curve[buf].push_back(0);
     topdown_curve[buf].push_back(0);
     bottomup_curve[buf].push_back(0);
   }

   // extract curves
   for (auto var : vars) {
     ItervarFeature& fea = touch_ext.itervar_map[var];
     for (auto kv : fea.touch_feature) {
       if (innermost_buffers.find(kv.first) != innermost_buffers.end()) {
         reuse_curve[kv.first].emplace_back(std::log(kv.second.reuse) / std::log(2));
         count_curve[kv.first].emplace_back(std::log(kv.second.count) / std::log(2));
         topdown_curve[kv.first].emplace_back(std::log(fea.topdown_product) / std::log(2));
         bottomup_curve[kv.first].emplace_back(std::log(fea.bottomup_product) / std::log(2));
       }
     }
   }

   // sample relation in the curve
   auto sample_curve = [&](const std::vector<double>& x, const std::vector<double>& y,
                           double weight) {
     for (int i = 0; i < sample_n; i++) {
       double xx = i * weight;
       for (int j = static_cast<int>(x.size()) - 1; j >= 0; j--) {
         if (xx > x[j] - 1e-6) {
           ret_feature->emplace_back(y[j]);
           ret_feature->emplace_back(xx - x[j]);
           break;
         }
       }
     }
   };

   // serialize to frontend
   for (auto k : innermost_buffers) {
     std::vector<double>& count = count_curve[k];
     std::vector<double>& reuse = reuse_curve[k];
     std::vector<double>& top_down = topdown_curve[k];

     std::sort(count.begin(), count.end());
     std::sort(reuse.begin(), reuse.end());
     std::sort(top_down.begin(), top_down.end());

     sample_curve(count, reuse, 1);
     sample_curve(reuse, count, 1);
     sample_curve(count, top_down, 1);
     sample_curve(top_down, count, 1);
   }
 }

 // register API for front end
 TVM_REGISTER_GLOBAL("autotvm.feature.GetItervarFeature")
     .set_body([](TVMArgs args, TVMRetValue* ret) {
       Stmt stmt = args[0];
       bool take_log = args[1];
       Array<Array<Array<PrimExpr> > > ret_feature;

       GetItervarFeature(stmt, take_log, &ret_feature);

       *ret = ret_feature;
     });

 TVM_REGISTER_GLOBAL("autotvm.feature.GetItervarFeatureFlatten")
     .set_body([](TVMArgs args, TVMRetValue* ret) {
       Stmt stmt = args[0];
       bool take_log = args[1];
       std::vector<float> ret_feature;

       GetItervarFeatureFlatten(stmt, take_log, &ret_feature);

       TVMByteArray arr;
       arr.size = sizeof(float) * ret_feature.size();
       arr.data = reinterpret_cast<char*>(ret_feature.data());
       *ret = arr;
     });

 TVM_REGISTER_GLOBAL("autotvm.feature.GetCurveSampleFeatureFlatten")
     .set_body([](TVMArgs args, TVMRetValue* ret) {
       Stmt stmt = args[0];
       int sample_n = args[1];
       std::vector<float> ret_feature;

       GetCurveSampleFeatureFlatten(stmt, sample_n, &ret_feature);

       TVMByteArray arr;
       arr.size = sizeof(float) * ret_feature.size();
       arr.data = reinterpret_cast<char*>(ret_feature.data());
       *ret = arr;
     });

 }  // namespace autotvm
 }  // namespace tvm
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	/*!
	* \file touch_extractor.cc
	* \brief Extract feature of touch pattern of axes in lowered IR
	*/

	#include "touch_extractor.h"

	#include <algorithm>
	#include <cmath>
	#include <set>
	#include <unordered_map>

	namespace tvm {
	namespace autotvm {

	int ParallelLevel(AnnotationType ann) {
	switch (ann) {
	case kBlockX:
	case kBlockY:
	case kBlockZ:
	return 2;
	case kThreadX:
	case kThreadY:
	case kThreadZ:
	case kParallel:
	return 1;
	default:
	return 0;
	}
	}

	// get touch pattern from index expression
	class IndexParser : public ExprVisitor {
	public:
	void Parse(PrimExpr expr) {
	pattern_map.clear();
	this->VisitExpr(expr);
	}

	void VisitExpr_(const VarNode* op) final {
	// TODO(lmzheng): handle more index types (multiple occurrence)
	if (pattern_map.count(op) == 0) {
	pattern_map[op] = TouchPattern();
	pattern_map[op].stride = next_stride_;
	next_stride_ = 1;
	}
	}

	void VisitExpr_(const MulNode* op) final {
	if (op->a.as<VarNode>()) {
	if (const auto stride = op->b.as<IntImmNode>()) {
	next_stride_ = stride->value;
	}
	}
	ExprVisitor::VisitExpr_(op);
	}

	std::unordered_map<const VarNode*, TouchPattern> pattern_map;

	private:
	int64_t next_stride_ = 1;
	};

	// extract iter vars and their touch pattern from ir
	bool TouchExtractor::EnterItervar_(Var var, int64_t length, AnnotationType ann_type) {
	// do not insert duplicated occurrences of virtual thread
	if (ann_type == kVirtualThread && itervar_map.count(var) != 0) {
	skip_stack_size_.push_back(itervar_stack_.size());
	return true;
	} else {
	itervar_stack_.push_back(var);
	topdown_product_ *= length;

	if (itervar_map.count(var) != 0) {
	// find two duplicated axes
	// these happens when we create tvm.thread_axis("threadIdx.x") once and
	// bind it twice. Here we treat them as two axes
	// so we create a snapshot for the old one and freeze it
	Var old = Var(var.get()->name_hint);
	itervar_map.insert({old, itervar_map[var]});
	itervar_map.erase(var);
	}

	itervar_map.insert(
	{var, ItervarFeature(var, length, static_cast<int>(itervar_stack_.size()), ann_type,
	topdown_product_, static_cast<int>(itervar_counter_++))});
	}

	return true;
	}

	void TouchExtractor::ExitItervar_() {
	if (!skip_stack_size_.empty() && skip_stack_size_.back() == itervar_stack_.size()) {
	skip_stack_size_.pop_back();
	return;
	}
	Var var = itervar_stack_.back();

	// update count and reuse ratio for upper iter vars (includes self)
	for (auto kv : itervar_map[var].touch_feature) {
	if (kv.second.stride != 0) { // multiply count
	for (auto stack_var : itervar_stack_) {
	auto touch_pattern = itervar_map[stack_var].touch_feature.find(kv.first);
	CHECK(touch_pattern != itervar_map[stack_var].touch_feature.end());
	touch_pattern->second.count *= itervar_map[var].length;
	}
	} else { // multiply reuse ratio
	for (auto stack_var : itervar_stack_) {
	auto touch_pattern = itervar_map[stack_var].touch_feature.find(kv.first);
	CHECK(touch_pattern != itervar_map[stack_var].touch_feature.end());
	touch_pattern->second.reuse *= itervar_map[var].length;
	}
	}
	}
	itervar_stack_.pop_back();

	int64_t length = itervar_map[var].length;
	if (length != 0) topdown_product_ /= length;
	int64_t bottomup_product = -1;
	for (auto kv : itervar_map[var].touch_feature) {
	bottomup_product = std::max(bottomup_product, kv.second.count * kv.second.reuse);
	}

	itervar_map[var].bottomup_product = bottomup_product;

	// push base to upper parallel axis
	int para_level = ParallelLevel(itervar_map[var].ann);
	// if is the separate line of parallel level, push the base to upper parallel level
	if (!itervar_stack_.empty() &&
	ParallelLevel(itervar_map[itervar_stack_.back()].ann) == para_level + 1) {
	for (auto kv : itervar_map[var].touch_feature) {
	for (auto stack_var : itervar_stack_) {
	if (ParallelLevel(itervar_map[stack_var].ann) == para_level + 1) {
	auto touch_pattern = itervar_map[stack_var].touch_feature.find(kv.first);
	CHECK(touch_pattern != itervar_map[stack_var].touch_feature.end());
	touch_pattern->second.thread_reuse = -kv.second.reuse;
	touch_pattern->second.thread_count = -kv.second.count;
	// NOTE: use minus as a flag to denote it is a base,
	// indicating it is not the final value
	}
	}
	}
	}

	for (auto kv : itervar_map[var].touch_feature) {
	if (kv.second.thread_count < 0) {
	itervar_map[var].touch_feature[kv.first].thread_count =
	kv.second.count / (-kv.second.thread_count);
	itervar_map[var].touch_feature[kv.first].thread_reuse =
	kv.second.reuse / (-kv.second.thread_reuse);
	}
	}
	}

	void TouchExtractor::EnterMem_(Var buffer_var, PrimExpr index) {
	std::string name = buffer_var.get()->name_hint;
	TouchedBuffer buf = name + "_" + std::to_string(buffer_counter_[name]++);

	// extract touch pattern from index
	IndexParser parser;
	parser.Parse(index);

	// push up mem access info
	for (auto var : itervar_stack_) {
	auto x = parser.pattern_map.find(var.get());
	if (x != parser.pattern_map.end()) {
	itervar_map[var].touch_feature[buf] = x->second;
	} else {
	itervar_map[var].touch_feature[buf] = TouchPattern();
	}
	}
	}

	void TouchExtractor::ExitMem_() {}

	/*!
	* \brief Get axis-based feature for all axes
	* \param stmt The statement to be extracted
	* \param bool Whether take log for numerical feature
	* \param ret_feature The buffer where the return value is stored
	*
	* \note The format of return value is
	* ((
	* ('_itervar_', var),
	* ('_attr_', length, nest_level, topdown, bottomup, one_hot_annotation),
	* ('_arith_', add_ct, mul_ct, div_ct),
	* ('data_vec_0', stride, mod, count, reuse, thread_count, thread_reuse),
	* ('conv_0', stride, mod, count, reuse, thread_count, thread_reuse),
	* ),
	* (
	* ('_itervar_', var2),
	* ('_attr_', length, nest_level, one_hot_annotation),
	* ('_arith_', add_ct, mul_ct, div_ct),
	* ('kernel_vec_0', stride, mod, count, reuse, thread_count, thread_reuse),
	* ('conv_1', stride, mod, count, reuse, thread_count, thread_reuse),
	* ))
	*
	* Itervars are sorted according to their first occurrence position in IR.
	* Buffers touched by an itervar are sorted by their unique names.
	*
	* \note If you want to flatten these features as the input of your model,
	* You can use the faster one GetItervarFeatureFlatten below.
	*/
	void GetItervarFeature(Stmt stmt, bool take_log, Array<Array<Array<PrimExpr> > >* ret_feature) {
	// extract
	TouchExtractor touch_analyzer;
	touch_analyzer.Analyze(stmt);

	// sort according to order
	std::vector<Var> vars;
	for (auto kv : touch_analyzer.itervar_map) {
	vars.push_back(kv.first);
	}
	std::sort(vars.begin(), vars.end(), [&](const Var& lhs, const Var& rhs) -> bool {
	return touch_analyzer.itervar_map[lhs].order < touch_analyzer.itervar_map[rhs].order;
	});

	// whether take log for numerical feature
	std::function<double(int64_t)> trans;
	if (take_log) {
	trans = [](int64_t x) {
	if (x < 0) return -std::log(-x + 1) / std::log(2);
	x = x + 1;
	return std::log(x) / std::log(2);
	};
	} else {
	trans = [](int64_t x) { return x; };
	}

	// serialize for front end
	for (auto var : vars) {
	Array<Array<PrimExpr> > feature_row;
	ItervarFeature& fea = touch_analyzer.itervar_map[var];
	feature_row.push_back(Array<PrimExpr>{tvm::tir::StringImm("_itervar_"), var});

	Array<PrimExpr> attr{
	tvm::tir::StringImm("_attr_"),
	FloatImm(DataType::Float(32), trans(fea.length)),
	IntImm(DataType::Int(32), fea.nest_level),
	FloatImm(DataType::Float(32), trans(fea.topdown_product)),
	FloatImm(DataType::Float(32), trans(fea.bottomup_product)),
	};
	// one hot annotation
	for (int i = 0; i < kNum; i++) {
	attr.push_back(i == fea.ann);
	}
	feature_row.push_back(attr);

	// arithmetic
	feature_row.push_back(Array<PrimExpr>{
	tvm::tir::StringImm("_arith_"),
	FloatImm(DataType::Float(32), trans(fea.add_ct)),
	FloatImm(DataType::Float(32), trans(fea.mul_ct)),
	FloatImm(DataType::Float(32), trans(fea.div_ct)),
	});

	// touch map
	std::vector<TouchedBuffer> bufs;
	for (auto kv : fea.touch_feature) {
	bufs.push_back(kv.first);
	}
	std::sort(bufs.begin(), bufs.end());
	for (auto k : bufs) {
	TouchPattern& v = fea.touch_feature[k];
	feature_row.push_back(Array<PrimExpr>{
	tvm::tir::StringImm(k),
	FloatImm(DataType::Float(32), trans(v.stride)),
	FloatImm(DataType::Float(32), trans(v.mod)),
	FloatImm(DataType::Float(32), trans(v.count)),
	FloatImm(DataType::Float(32), trans(v.reuse)),
	FloatImm(DataType::Float(32), trans(v.thread_count)),
	FloatImm(DataType::Float(32), trans(v.thread_reuse)),
	});
	}

	ret_feature->push_back(feature_row);
	}
	}

	/*!
	* \brief Get axis-based feature for all axes and flatten them into a one-dimensional vector.
	* \param stmt The statement to be extracted
	* \param bool Whether take log for numerical feature
	* \param ret_feature The buffer where the return value is stored
	*
	* \note See GetItervarFeature for more details about the return value.
	* This is an optimized version of GetItervarFeature + Flatten. This runs much faster.
	*/
	void GetItervarFeatureFlatten(Stmt stmt, bool take_log, std::vector<float>* ret_feature) {
	// extract touch feature
	TouchExtractor touch_analyzer;
	touch_analyzer.Analyze(stmt);

	// sort according to order
	std::vector<Var> vars;
	for (auto kv : touch_analyzer.itervar_map) {
	vars.push_back(kv.first);
	}
	std::sort(vars.begin(), vars.end(), [&](const Var& lhs, const Var& rhs) -> bool {
	return touch_analyzer.itervar_map[lhs].order < touch_analyzer.itervar_map[rhs].order;
	});

	// whether take log for numerical feature
	std::function<float(int64_t)> trans;
	if (take_log) {
	trans = [](int64_t x) {
	if (x < 0) return -std::log(-x + 1) / std::log(2);
	x = x + 1;
	return std::log(x) / std::log(2);
	};
	} else {
	trans = [](int64_t x) { return x; };
	}

	// serialize for front end
	for (auto var : vars) {
	ItervarFeature& fea = touch_analyzer.itervar_map[var];

	ret_feature->push_back(trans(fea.length));
	ret_feature->push_back(fea.nest_level);
	ret_feature->push_back(trans(fea.topdown_product));
	ret_feature->push_back(trans(fea.bottomup_product));

	// one hot annotation
	for (int i = 0; i < kNum; i++) {
	ret_feature->push_back(i == fea.ann);
	}

	// arithmetic
	ret_feature->push_back(trans(fea.add_ct));
	ret_feature->push_back(trans(fea.mul_ct));
	ret_feature->push_back(trans(fea.div_ct));

	// touch map
	std::vector<TouchedBuffer> bufs;
	for (auto kv : fea.touch_feature) {
	bufs.push_back(kv.first);
	}
	std::sort(bufs.begin(), bufs.end());
	for (auto k : bufs) {
	TouchPattern& v = fea.touch_feature[k];
	ret_feature->push_back(trans(v.stride));
	ret_feature->push_back(trans(v.mod));
	ret_feature->push_back(trans(v.count));
	ret_feature->push_back(trans(v.reuse));
	ret_feature->push_back(trans(v.thread_count));
	ret_feature->push_back(trans(v.thread_reuse));
	}
	}
	}

	/*!
	* \brief Get curve sample feature (relation feature) and flatten them into a one-dimensional
	* vector. \param stmt The statement to be extracted \param sample_n The number of points used for
	* sampling a curve (along one dimension) \param ret_feature The buffer where the return value is
	* stored
	*/
	void GetCurveSampleFeatureFlatten(Stmt stmt, int sample_n, std::vector<float>* ret_feature) {
	// extract touch feature
	TouchExtractor touch_ext;
	touch_ext.Analyze(stmt);

	// sort according to order
	std::vector<Var> vars;
	for (auto kv : touch_ext.itervar_map) {
	vars.push_back(kv.first);
	}
	std::sort(vars.begin(), vars.end(), [&](const Var& lhs, const Var& rhs) -> bool {
	return touch_ext.itervar_map[lhs].order < touch_ext.itervar_map[rhs].order;
	});

	int max_depth = 0;
	std::map<TouchedBuffer, std::vector<double> > reuse_curve;
	std::map<TouchedBuffer, std::vector<double> > count_curve;
	std::map<TouchedBuffer, std::vector<double> > topdown_curve;
	std::map<TouchedBuffer, std::vector<double> > bottomup_curve;
	std::set<TouchedBuffer> innermost_buffers;
	std::set<std::string> added;

	// find maximum depth of loop nest
	for (auto var : vars) {
	ItervarFeature& fea = touch_ext.itervar_map[var];
	max_depth = std::max(max_depth, fea.nest_level);
	}

	// mark inner most buffer
	for (auto iter = vars.rbegin(); iter != vars.rend(); iter++) {
	auto var = *iter;
	ItervarFeature& fea = touch_ext.itervar_map[var];
	if (fea.nest_level == max_depth) {
	for (auto kv : fea.touch_feature) {
	// delete buffer no (e.g. 'A_0' -> 'A', 'A_1' -> 'A')
	std::string raw_name = kv.first.substr(0, kv.first.rfind("_"));

	// delete memory scope (e.g. 'A.local' -> 'A', 'A.shared' -> 'A')
	size_t pos = raw_name.find(".");
	if (pos < kv.first.size()) raw_name = raw_name.substr(0, pos);

	// If there are multiple innermost buffers that are derived from a same raw buffer
	// We only record the last occurrence (note the `iter` is in reverse order)
	// e.g. `A.local`, `A.shared` are derived from `A`, if they all occurred at the inner most
	// level, we will only record the last occurrence,
	if (added.find(raw_name) == added.end()) {
	innermost_buffers.insert(kv.first);
	added.insert(raw_name);
	}
	}
	}
	}

	// pad the first point (zero) for all curves
	for (auto buf : innermost_buffers) {
	reuse_curve[buf].push_back(0);
	count_curve[buf].push_back(0);
	topdown_curve[buf].push_back(0);
	bottomup_curve[buf].push_back(0);
	}

	// extract curves
	for (auto var : vars) {
	ItervarFeature& fea = touch_ext.itervar_map[var];
	for (auto kv : fea.touch_feature) {
	if (innermost_buffers.find(kv.first) != innermost_buffers.end()) {
	reuse_curve[kv.first].emplace_back(std::log(kv.second.reuse) / std::log(2));
	count_curve[kv.first].emplace_back(std::log(kv.second.count) / std::log(2));
	topdown_curve[kv.first].emplace_back(std::log(fea.topdown_product) / std::log(2));
	bottomup_curve[kv.first].emplace_back(std::log(fea.bottomup_product) / std::log(2));
	}
	}
	}

	// sample relation in the curve
	auto sample_curve = [&](const std::vector<double>& x, const std::vector<double>& y,
	double weight) {
	for (int i = 0; i < sample_n; i++) {
	double xx = i * weight;
	for (int j = static_cast<int>(x.size()) - 1; j >= 0; j--) {
	if (xx > x[j] - 1e-6) {
	ret_feature->emplace_back(y[j]);
	ret_feature->emplace_back(xx - x[j]);
	break;
	}
	}
	}
	};

	// serialize to frontend
	for (auto k : innermost_buffers) {
	std::vector<double>& count = count_curve[k];
	std::vector<double>& reuse = reuse_curve[k];
	std::vector<double>& top_down = topdown_curve[k];

	std::sort(count.begin(), count.end());
	std::sort(reuse.begin(), reuse.end());
	std::sort(top_down.begin(), top_down.end());

	sample_curve(count, reuse, 1);
	sample_curve(reuse, count, 1);
	sample_curve(count, top_down, 1);
	sample_curve(top_down, count, 1);
	}
	}

	// register API for front end
	TVM_REGISTER_GLOBAL("autotvm.feature.GetItervarFeature")
	.set_body([](TVMArgs args, TVMRetValue* ret) {
	Stmt stmt = args[0];
	bool take_log = args[1];
	Array<Array<Array<PrimExpr> > > ret_feature;

	GetItervarFeature(stmt, take_log, &ret_feature);

	*ret = ret_feature;
	});

	TVM_REGISTER_GLOBAL("autotvm.feature.GetItervarFeatureFlatten")
	.set_body([](TVMArgs args, TVMRetValue* ret) {
	Stmt stmt = args[0];
	bool take_log = args[1];
	std::vector<float> ret_feature;

	GetItervarFeatureFlatten(stmt, take_log, &ret_feature);

	TVMByteArray arr;
	arr.size = sizeof(float) * ret_feature.size();
	arr.data = reinterpret_cast<char*>(ret_feature.data());
	*ret = arr;
	});

	TVM_REGISTER_GLOBAL("autotvm.feature.GetCurveSampleFeatureFlatten")
	.set_body([](TVMArgs args, TVMRetValue* ret) {
	Stmt stmt = args[0];
	int sample_n = args[1];
	std::vector<float> ret_feature;

	GetCurveSampleFeatureFlatten(stmt, sample_n, &ret_feature);

	TVMByteArray arr;
	arr.size = sizeof(float) * ret_feature.size();
	arr.data = reinterpret_cast<char*>(ret_feature.data());
	*ret = arr;
	});

	} // namespace autotvm
	} // namespace tvm