Building OPs in Tensorflow

In order to accelerate the machine learning operations, the implementation of C++ is preferred than python. In tensorflow, we can compile C++ and call the function in python runtime. The way is shown in [1]. First, we need to get the header directory of tensorflow to compile our code.


Python 3.5.2 (default, Nov 17 2016, 17:05:23)
[GCC 5.4.0 20160609] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import tensorflow as tf
>>> tf.sysconfig.get_include()
'/home/zhuang/Dropbox/Projects/tensorflow_models/env/lib/python3.5/site-packages/tensorflow/include'

The complication is done like this


TF_INC=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())')

g++ -std=c++11 -shared zero_out.cc -o zero_out.so -fPIC -I $TF_INC -O2

We use word2vec [2] as another example to show the flow

@word2vec_kernel.cc

class SkipgramWord2vecOp : public OpKernel {
 public:
  explicit SkipgramWord2vecOp(OpKernelConstruction* ctx)
      : OpKernel(ctx), rng_(&philox_) {
    string filename;
    OP_REQUIRES_OK(ctx, ctx->GetAttr("filename", &filename));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("batch_size", &batch_size_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("window_size", &window_size_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("min_count", &min_count_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("subsample", &subsample_));
    OP_REQUIRES_OK(ctx, Init(ctx->env(), filename));

    mutex_lock l(mu_);
    example_pos_ = corpus_size_;
    label_pos_ = corpus_size_;
    label_limit_ = corpus_size_;
    sentence_index_ = kSentenceSize;
    for (int i = 0; i < kPrecalc; ++i) {
      NextExample(&precalc_examples_[i].input, &precalc_examples_[i].label);
    }
  }

  void Compute(OpKernelContext* ctx) override {
    Tensor words_per_epoch(DT_INT64, TensorShape({}));
    Tensor current_epoch(DT_INT32, TensorShape({}));
    Tensor total_words_processed(DT_INT64, TensorShape({}));
    Tensor examples(DT_INT32, TensorShape({batch_size_}));
    auto Texamples = examples.flat<int32>();
    Tensor labels(DT_INT32, TensorShape({batch_size_}));
    auto Tlabels = labels.flat<int32>();
    {
      mutex_lock l(mu_);
      for (int i = 0; i < batch_size_; ++i) {         Texamples(i) = precalc_examples_[precalc_index_].input;         Tlabels(i) = precalc_examples_[precalc_index_].label;         precalc_index_++;         if (precalc_index_ >= kPrecalc) {
          precalc_index_ = 0;
          for (int j = 0; j < kPrecalc; ++j) {
            NextExample(&precalc_examples_[j].input,
                        &precalc_examples_[j].label);
          }
        }
      }
      words_per_epoch.scalar<int64>()() = corpus_size_;
      current_epoch.scalar<int32>()() = current_epoch_;
      total_words_processed.scalar<int64>()() = total_words_processed_;
    }
    ctx->set_output(0, word_);
    ctx->set_output(1, freq_);
    ctx->set_output(2, words_per_epoch);
    ctx->set_output(3, current_epoch);
    ctx->set_output(4, total_words_processed);
    ctx->set_output(5, examples);
    ctx->set_output(6, labels);
  }

 private:
  struct Example {
    int32 input;
    int32 label;
  };

  int32 batch_size_ = 0;
  int32 window_size_ = 5;
  float subsample_ = 1e-3;
  int min_count_ = 5;
  int32 vocab_size_ = 0;
  Tensor word_;
  Tensor freq_;
  int64 corpus_size_ = 0;
  std::vector<int32> corpus_;
  std::vector<Example> precalc_examples_;
  int precalc_index_ = 0;
  std::vector<int32> sentence_;
  int sentence_index_ = 0;

  mutex mu_;
  random::PhiloxRandom philox_ GUARDED_BY(mu_);
  random::SimplePhilox rng_ GUARDED_BY(mu_);
  int32 current_epoch_ GUARDED_BY(mu_) = -1;
  int64 total_words_processed_ GUARDED_BY(mu_) = 0;
  int32 example_pos_ GUARDED_BY(mu_);
  int32 label_pos_ GUARDED_BY(mu_);
  int32 label_limit_ GUARDED_BY(mu_);

  void NextExample(int32* example, int32* label) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
    ...
  }

  Status Init(Env* env, const string& filename) {
    ...
    return Status::OK();
  }
};

REGISTER_KERNEL_BUILDER(Name("SkipgramWord2vec").Device(DEVICE_CPU), SkipgramWord2vecOp);

@word2vec_ops.cc

namespace tensorflow {

REGISTER_OP("SkipgramWord2vec")
    .Output("vocab_word: string")
    .Output("vocab_freq: int32")
    .Output("words_per_epoch: int64")
    .Output("current_epoch: int32")
    .Output("total_words_processed: int64")
    .Output("examples: int32")
    .Output("labels: int32")
    .SetIsStateful()
    .Attr("filename: string")
    .Attr("batch_size: int")
    .Attr("window_size: int = 5")
    .Attr("min_count: int = 5")
    .Attr("subsample: float = 1e-3")
    .Doc(R"doc(...)doc");

REGISTER_OP("NegTrainWord2vec")
...
} // end namespace tensorflow

To compile,

TF_INC=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())')
g++ -std=c++11 -shared word2vec_ops.cc word2vec_kernels.cc -o word2vec_ops.so -fPIC -I $TF_INC -O2 -D_GLIBCXX_USE_CXX11_ABI=0

@word2vec_optimized.py, the word2vec C++ can be used as follows,

import tensorflow as tf 

word2vec = tf.load_op_library(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'word2vec_ops.so'))

(words, counts, words_per_epoch, current_epoch, total_words_processed,
     examples, labels) = word2vec.skipgram_word2vec(filename=opts.train_data,
                                                    batch_size=opts.batch_size,
                                                    window_size=opts.window_size,
                                                    min_count=opts.min_count,
                                                    subsample=opts.subsample)
(opts.vocab_words, opts.vocab_counts,
opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])

The question is why we can use “skipgram_word2vec”.
The answer is in the page [1] about naming in the tensorflow wrapper.

For example,

REGISTER_OP("StringToNumber")
    .Input("string_tensor: string")
    .Output("output: out_type")
    .Attr("out_type: {float, int32} = DT_FLOAT");
    .Doc(R"doc(
Converts each string in the input Tensor to the specified numeric type.
)doc");

“A note on naming: Inputs, outputs, and attrs generally should be given snake_case names. The one exception is attrs that are used as the type of an input or in the type of an input. Those attrs can be inferred when the op is added to the graph and so don’t appear in the op’s function. For example, this last definition of ZeroOut will generate a Python function that looks like:”

def string_to_number(string_tensor, out_type=None, name=None):
  """Converts each string in the input Tensor to the specified numeric type.

  Args:
    string_tensor: A `Tensor` of type `string`.
    out_type: An optional `tf.DType` from: `tf.float32, tf.int32`.
      Defaults to `tf.float32`.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of type `out_type`.
  """

Therefore, we have the following lines for skipgram_word2vec

REGISTER_OP("SkipgramWord2vec")
    .Output("vocab_word: string")
    ....

will generate
def skipgram_word2vec(...):
  """...

  Args:
     ...
  Returns:
    ...
  """

Acknowledgement:
The code is from https://github.com/zhuangh/models/commits/master/tutorials/embedding. Thanks to the upstream tensorflow/models. And nealwu‘s help at github https://github.com/tensorflow/models/issues/1094

There is also a Chinese blog¬†talking about Tensorflow’s OPs.

References:
[1] Tensorflow Document https://www.tensorflow.org/extend/adding_an_op#building_the_op_library
[2]
https://github.com/zhuangh/models/tree/master/tutorials/embedding

Advertisements

[OJ report] binary tree right side view

https://leetcode.com/problems/binary-tree-right-side-view/

The solution is at solution @ github .

The idea is to use DFS to do pre-order traversal of the mirror tree. At the same time, use an array to mark current level, so that we can keep from putting wrong element into the result vector.

class Solution {
public:
   void Helper( TreeNode * root, vector<int> & result,
                    int cur, vector<int> & marked ){
     if(root == NULL) return;
    // pre-order of the mirror binary tree
    // (DFS & just by swapping the traversal order)
    if( result.size() == cur ){
      // maintain a marked vector to keep from putting
      // wrong element into the result vector
      result.push_back(root->val);
    }
    Helper(root->right, result , cur+1, marked);
    Helper(root->left , result , cur+1, marked);
   }
  vector<int> rightSideView(TreeNode * root){
    vector<int> result, marked;
    if(root == NULL) return result;
    Helper(root, result, 0, marked);
    return result;
  }
};

The runtime complexity is O(n), where n is the node number.

The worst-case memory complexity is O(n) = O(n) .