
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Synthetic Datasets."""

import numpy as np
from itertools import product
from renn import utils

__all__ = ['Unordered']


def constant_sampler(value):
  """Returns a sampling function which always returns the value 'value'"""

  def sample(num_samples):
    return np.full((num_samples,), value)

  max_length = value

  return sample, max_length
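
# Illustrative usage (an added sketch, not part of the original module): every
# draw from a constant sampler is `value`, and `max_length` equals `value`.
#
#   sample, max_length = constant_sampler(40)
#   sample(3)    # -> array([40, 40, 40])
#   max_length   # -> 40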


def uniform_sampler(min_val, max_val):
  """returns a sampling function which samples uniformly between min_val and
  max_val, inclusive"""

  def sample(num_samples):
    return np.random.randint(min_val, max_val + 1, size=(num_samples,))

  max_length = max_val

  return sample, max_length
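
# Illustrative usage (an added sketch, not part of the original module):
# lengths are drawn uniformly from [min_val, max_val] inclusive, so the
# maximum possible length is simply max_val.
#
#   sample, max_length = uniform_sampler(5, 10)
#   sample(4)    # -> e.g. array([7, 5, 10, 6]); values vary per draw
#   max_length   # -> 10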


def build_vocab(valences=None, num_classes=3):
  """ Builds the vocabulary
  Vocabulary for this dataset consists of tuples, e.g., ('very', 3),
    indicating in this case a token which provides strong evidence of class 3.
  """

  if valences is None:
    valences = {
        'strongly_favor': 2,
        'favor': 1,
        'neutral': 0,
        'against': -1,
        'strongly_against': -2
    }

  words = product(valences, range(num_classes))

  def _score(word):
    """Converts a word like ('very', 1) to a
    vector-valued score, in this case (0,2,0,...)"""
    score = np.zeros(num_classes)
    score[word[1]] = valences[word[0]]
    return score

  vocab = {i: _score(word) for i, word in enumerate(words)}
  return vocab
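
# Illustrative sketch (added, not part of the original module): with the
# default valences and num_classes=3, the vocabulary has 5 * 3 = 15 tokens.
# Each integer token id maps to a length-3 evidence vector; e.g. the token
# built from ('strongly_favor', 1) maps to array([0., 2., 0.]).
#
#   vocab = build_vocab(num_classes=3)
#   len(vocab)   # -> 15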


class Unordered:
  """Synthetic dataset representing unordered classes, mimicking
  text-classification datasets like AG News (unlike, say, star-prediction
  or sentiment analysis, which feature ordered classes)."""

  def __init__(self,
               num_classes=3,
               batch_size=64,
               length_sampler='Constant',
               sampler_params=None):
    SAMPLERS = {'Constant': constant_sampler, 'Uniform': uniform_sampler}

    # Avoid a mutable default argument; this default matches the original
    # behavior of a constant length of 40.
    if sampler_params is None:
      sampler_params = {'value': 40}

    self.num_classes = num_classes
    self.batch_size = batch_size

    if length_sampler in SAMPLERS:
      self.sampler, self.max_len = SAMPLERS[length_sampler](**sampler_params)
    else:
      raise ValueError(f'length_sampler must be one of {list(SAMPLERS)}')

    self.vocab = build_vocab(num_classes=num_classes)

  def __iter__(self):
    return self

  def __next__(self):
    """Samples and returns a batch.

    As with the real datasets, a batch is a dictionary containing
    'inputs', 'labels', and 'index' keys, where 'index' gives the length
    of each sequence.
    """
    lengths = self.sampler(num_samples=self.batch_size)
    max_length = max(lengths)

    batch = {
        'inputs':
            np.random.randint(len(self.vocab),
                              size=(self.batch_size, max_length)),
        'index':
            lengths,
    }
    batch['labels'] = self.label_batch(batch)

    return batch
  def label_batch(self, batch):
    """Calculates class labels for a batch of sentences."""
    zipped = zip(batch['inputs'], batch['index'])
    class_scores = np.array([self.score(s, l) for s, l in zipped])
    return np.argmax(class_scores, axis=1)
  def score(self, sentence, length):
    """Calculates the score, i.e. the amount of accumulated evidence in
    the sentence, for each class."""
    return sum(self.vocab[word] for word in sentence[:length])
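

# A minimal usage sketch (an illustrative addition, not part of the original
# module): draw a single batch from the synthetic dataset and inspect it. The
# dataset is an infinite iterator, so one batch is taken with next().
if __name__ == '__main__':
  dataset = Unordered(num_classes=3,
                      batch_size=4,
                      length_sampler='Uniform',
                      sampler_params={'min_val': 5, 'max_val': 10})
  batch = next(dataset)
  print(batch['inputs'].shape)  # (4, max sampled length)
  print(batch['index'])         # per-example sequence lengths
  print(batch['labels'])        # argmax of accumulated class evidence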