In this blog post, I focus on explaining how “f” and “g” are computed in the word2vec C code:
```c
else {
  // the resulting f is a kind of position
  f = (f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2);
  f = expTable[(int) f];
}
// 'g' is the gradient multiplied by the learning rate
double g = (1 - word.codeArr[i] - f) * alpha;
```
For “f”:
It uses a precomputed lookup table to speed up the sigmoid function:
```c
for (i = 0; i < EXP_TABLE_SIZE; i++) {
  // map i in [0, EXP_TABLE_SIZE) to x in [-MAX_EXP, MAX_EXP), then compute e^x
  expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);
  // e^x / (e^x + 1) = 1 / (1 + e^(-x)) = sigmoid(x)
  expTable[i] = expTable[i] / (expTable[i] + 1);
}
```
This trick is actually simple to understand.
It is easy to see that sigmoid(x) barely changes once x is very large or very small. So let’s treat sigma(x) as 1 when x > MAX_EXP, and as 0 when x < -MAX_EXP. Within that range, the code splits [-MAX_EXP, MAX_EXP] into EXP_TABLE_SIZE pieces, and inside each piece we treat sigma(x) as constant.
So, in the initialization of expTable, i = 0 corresponds to choosing the first piece, where the original x = -MAX_EXP; similarly, i = EXP_TABLE_SIZE - 1 selects the last piece, where x is just below MAX_EXP.
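To make the endpoints concrete, here is a minimal self-contained sketch (using word2vec’s default constants EXP_TABLE_SIZE = 1000 and MAX_EXP = 6) that builds the table and compares its first and last entries against the exact sigmoid:

```c
#include <stdio.h>
#include <math.h>

#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
typedef float real;

static real expTable[EXP_TABLE_SIZE];

int main(void) {
  // Build the table exactly as word2vec does.
  for (int i = 0; i < EXP_TABLE_SIZE; i++) {
    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);
    expTable[i] = expTable[i] / (expTable[i] + 1);
  }
  // The first entry corresponds to x = -MAX_EXP, the last to x just below MAX_EXP.
  printf("expTable[0]   = %f, exact sigmoid(-6) = %f\n",
         expTable[0], 1.0 / (1.0 + exp(6.0)));
  printf("expTable[%d] = %f, exact sigmoid(+6) = %f\n",
         EXP_TABLE_SIZE - 1, expTable[EXP_TABLE_SIZE - 1], 1.0 / (1.0 + exp(-6.0)));
  return 0;
}
```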
Based on this conversion, each time we get a raw “f” value, we only need to figure out which piece the value falls into. That is implemented by:
```c
f = (f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2);
```
Namely, each piece has width 2 * MAX_EXP / EXP_TABLE_SIZE, and the distance from the raw “f” to the left border “-MAX_EXP” is f + MAX_EXP. Hence, the index of the target piece should be (f + MAX_EXP) / (2 * MAX_EXP / EXP_TABLE_SIZE) = (f + MAX_EXP) * EXP_TABLE_SIZE / (2 * MAX_EXP), which is exactly what the code computes, since EXP_TABLE_SIZE / MAX_EXP / 2 equals EXP_TABLE_SIZE / (2 * MAX_EXP).
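Putting the saturation and the lookup together, the whole computation can be wrapped as a small helper. This is only a sketch, assuming the expTable, MAX_EXP, and EXP_TABLE_SIZE definitions from above are in scope; note that the original word2vec code inlines this logic and, in the hierarchical-softmax branch, simply skips the update when f falls outside [-MAX_EXP, MAX_EXP], whereas this sketch saturates to 0 or 1:

```c
// Approximate sigmoid(f) by table lookup (a sketch, not the original inline code).
// Outside [-MAX_EXP, MAX_EXP] the sigmoid is effectively saturated.
real sigmoid_lookup(real f) {
  if (f >= MAX_EXP) return 1;   // sigmoid(x) -> 1 for large x
  if (f <= -MAX_EXP) return 0;  // sigmoid(x) -> 0 for small x
  // distance to the left border, divided by the piece width 2 * MAX_EXP / EXP_TABLE_SIZE
  return expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
}
```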
For “g”:
As my other blog post (https://yinwenpeng.wordpress.com/2013/09/26/hierarchical-softmax-in-neural-network-language-model/) explained, the sigmoid in hierarchical softmax is sigma(x) = 1 / (1 + e^(-x)). Let’s denote the raw inner product (the value of “f” before the table lookup) as “simi”; then the probability at each inner node can be rewritten as p = sigma(simi)^(1 - code) * (1 - sigma(simi))^code.
How do we interpret the above formula?
When code == 1, p = 1 - sigma(simi); when code == 0, p = sigma(simi). The two cases sum to 1, so this is a valid binary probability.
We usually take the derivative of sigma(simi) after applying log(.). So, log p = (1 - code) * log(sigma(simi)) + code * log(1 - sigma(simi)). Its derivative with respect to “simi” is 1 - code - sigma(simi) = 1 - code - f (note that the final “f” is looked up from the expTable), and multiplying by the learning rate alpha gives exactly the “g” in the code.
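To double-check this derivative, here is a minimal self-contained sketch (the helper names sigmoid and log_prob are mine, not from word2vec) that computes g for both code values and compares the analytic gradient 1 - code - sigma(simi) against a finite-difference estimate of d(log p)/d(simi):

```c
#include <stdio.h>
#include <math.h>

// Exact sigmoid, standing in for the expTable lookup above.
static double sigmoid(double x) { return 1.0 / (1.0 + exp(-x)); }

// log p(code | simi) = (1 - code) * log(sigmoid(simi)) + code * log(1 - sigmoid(simi))
static double log_prob(int code, double simi) {
  double s = sigmoid(simi);
  return (1 - code) * log(s) + code * log(1 - s);
}

int main(void) {
  double simi = 0.7, alpha = 0.025, eps = 1e-6;  // example values; 0.025 is word2vec's default starting alpha
  for (int code = 0; code <= 1; code++) {
    double f = sigmoid(simi);
    double g = (1 - code - f) * alpha;  // word2vec's update: gradient times learning rate
    double numeric = (log_prob(code, simi + eps) - log_prob(code, simi - eps)) / (2 * eps);
    printf("code=%d  analytic=%f  numeric=%f  g=%f\n", code, 1 - code - f, numeric, g);
  }
  return 0;
}
```

Running this prints matching analytic and numeric gradients for both code values, confirming that g = (1 - code - f) * alpha is indeed the log-likelihood gradient scaled by the learning rate.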