
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Synthetic Datasets."""

import numpy as np
from itertools import product
from renn import utils

__all__ = ['Unordered']


def constant_sampler(value):
  """Returns a sampling function which always returns the value 'value'"""

  def sample(num_samples):
    return np.full((num_samples,), value)

  max_length = value

  return sample, max_length
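
# Illustrative usage (an added sketch, not part of the original module): every
# draw from a constant sampler is `value`, and `max_length` equals `value`.
#
#   sample, max_length = constant_sampler(40)
#   sample(3)    # -> array([40, 40, 40])
#   max_length   # -> 40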


def uniform_sampler(min_val, max_val):
  """returns a sampling function which samples uniformly between min_val and
  max_val, inclusive"""

  def sample(num_samples):
    return np.random.randint(min_val, max_val + 1, size=(num_samples,))

  max_length = max_val

  return sample, max_length
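
# Illustrative usage (an added sketch, not part of the original module):
# lengths are drawn uniformly from [min_val, max_val] inclusive, so the
# maximum possible length is simply max_val.
#
#   sample, max_length = uniform_sampler(5, 10)
#   sample(4)    # -> e.g. array([7, 5, 10, 6]); values vary per draw
#   max_length   # -> 10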


def build_vocab(valences=None, num_classes=3):
  """ Builds the vocabulary
  Vocabulary for this dataset consists of tuples, e.g., ('very', 3),
    indicating in this case a token which provides strong evidence of class 3.
  """

  if valences is None:
    valences = {
        'strongly_favor': 2,
        'favor': 1,
        'neutral': 0,
        'against': -1,
        'strongly_against': -2
    }

  words = product(valences, range(num_classes))

  def _score(word):
    """Converts a word like ('very', 1) to a
    vector-valued score, in this case (0,2,0,...)"""
    score = np.zeros(num_classes)
    score[word[1]] = valences[word[0]]
    return score

  vocab = {i: _score(word) for i, word in enumerate(words)}
  return vocab
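
# Illustrative sketch (added, not part of the original module): with the
# default valences and num_classes=3, the vocabulary has 5 * 3 = 15 tokens.
# Each integer token id maps to a length-3 evidence vector; e.g. the token
# built from ('strongly_favor', 1) maps to array([0., 2., 0.]).
#
#   vocab = build_vocab(num_classes=3)
#   len(vocab)   # -> 15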


class Unordered:
  """Synthetic dataset representing unordered classes, mimicking
  text-classification datasets like AG News (unlike, say, star-prediction
  or sentiment analysis, which feature ordered classes)."""

  def __init__(self,
               num_classes=3,
               batch_size=64,
               length_sampler='Constant',
               sampler_params=None):
    SAMPLERS = {'Constant': constant_sampler, 'Uniform': uniform_sampler}

    # Avoid a mutable default argument; this default matches the original
    # behavior of a constant length of 40.
    if sampler_params is None:
      sampler_params = {'value': 40}

    self.num_classes = num_classes
    self.batch_size = batch_size

    if length_sampler in SAMPLERS:
      self.sampler, self.max_len = SAMPLERS[length_sampler](**sampler_params)
    else:
      raise ValueError(f'length_sampler must be one of {list(SAMPLERS)}')

    self.vocab = build_vocab(num_classes=num_classes)

  def __iter__(self):
    return self

  def __next__(self):
    """Samples and returns a batch.

    As with the real datasets, a batch is a dictionary containing
    'inputs', 'labels', and 'index' keys, where 'index' gives the length
    of each sequence.
    """
    lengths = self.sampler(num_samples=self.batch_size)
    max_length = max(lengths)

    batch = {
        'inputs':
            np.random.randint(len(self.vocab),
                              size=(self.batch_size, max_length)),
        'index':
            lengths,
    }
    batch['labels'] = self.label_batch(batch)

    return batch
  def label_batch(self, batch):
    """Calculates class labels for a batch of sentences."""
    zipped = zip(batch['inputs'], batch['index'])
    class_scores = np.array([self.score(s, l) for s, l in zipped])
    return np.argmax(class_scores, axis=1)
  def score(self, sentence, length):
    """Calculates the score, i.e. the amount of accumulated evidence in
    the sentence, for each class."""
    return sum(self.vocab[word] for word in sentence[:length])
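

# A minimal usage sketch (an illustrative addition, not part of the original
# module): draw a single batch from the synthetic dataset and inspect it. The
# dataset is an infinite iterator, so one batch is taken with next().
if __name__ == '__main__':
  dataset = Unordered(num_classes=3,
                      batch_size=4,
                      length_sampler='Uniform',
                      sampler_params={'min_val': 5, 'max_val': 10})
  batch = next(dataset)
  print(batch['inputs'].shape)  # (4, max sampled length)
  print(batch['index'])         # per-example sequence lengths
  print(batch['labels'])        # argmax of accumulated class evidence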