| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| /* |
| * Softmax classifier layer. |
| */ |
| |
forward = function(matrix[double] scores)
    return (matrix[double] probs) {
  /*
   * Forward pass of a softmax classifier.
   *
   * Every one of the N rows of `scores` holds D values that are read
   * as unnormalized log-probabilities, one per class. Softmax maps
   * each row to a normalized probability distribution over the D
   * classes, and can be seen as the multi-class generalization of
   * the sigmoid function.
   *
   * `probs_ij = e^scores_ij / sum_k(e^scores_ik)`
   *
   * Inputs:
   *  - scores: Inputs, of shape (N, D).
   *
   * Outputs:
   *  - probs: Outputs, of shape (N, D).
   */
  # Multiplying numerator and denominator by the same constant C
  # leaves the quotient unchanged:
  #   e^s_ij / sum_k(e^s_ik) == (C*e^s_ij) / (C*sum_k(e^s_ik))
  #                          == e^(s_ij+log(C)) / sum_k(e^(s_ik+log(C)))
  # Choosing log(C) = -max_k(s_ik) makes the largest exponent in each
  # row equal to 0, so exp() never sees a positive argument.
  row_max = rowMaxs(scores)
  shifted = scores - row_max

  # Unnormalized probabilities of the shifted scores.
  exp_shifted = exp(shifted)

  # Divide each row by its own total to obtain probabilities.
  row_total = rowSums(exp_shifted)
  probs = exp_shifted / row_total
}
| |
backward = function(matrix[double] dprobs, matrix[double] scores)
    return (matrix[double] dscores) {
  /*
   * Backward pass of a softmax classifier.
   *
   * Each score s_ij influences every probability in row i, so its
   * gradient collects one contribution per class:
   *
   *  ```
   *  dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij)
   *  dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j
   *
   *  dloss/dscores_ij =
   *      (dloss/dprobs_ij * dprobs_ij/dscores_ij)
   *      + sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij)
   *  ```
   *
   * Inputs:
   *  - dprobs: Gradient wrt `probs` from upstream, of shape (N, D).
   *  - scores: Inputs, of shape (N, D).
   *
   * Outputs:
   *  - dscores: Gradient wrt `scores`, of shape (N, D).
   */
  # Recompute the forward-pass probabilities, shifting each row by its
  # maximum first so that exp() only sees non-positive arguments.
  shifted = scores - rowMaxs(scores)
  exp_shifted = exp(shifted)
  probs = exp_shifted / rowSums(exp_shifted)

  # Collapsing the two Jacobian cases above gives
  #   dscores = dprobs*probs - probs*rowSums(dprobs*probs)
  weighted = dprobs * probs
  row_weight = rowSums(weighted)
  dscores = weighted - probs * row_weight
}
| |