Code Coverage
 
Classes and Traits
Functions and Methods
Lines
Total
0.00%
0 / 1
0.00%
0 / 7
CRAP
0.00%
0 / 178
SVM
0.00%
0 / 1
0.00%
0 / 7
1260
0.00%
0 / 178
 __construct( DataSourceInterface $dataSource, SVMModel $model = null, Document\NormalizerInterface $documentNormalizer = null, TokenizerInterface $tokenizer = null, Token\NormalizerInterface $tokenNormalizer = null, \SVM $svm = null, $threshold = null )
0.00%
0 / 1
42
0.00%
0 / 27
 prepareModel()
0.00%
0 / 1
240
0.00%
0 / 87
 classify($document)
0.00%
0 / 1
12
0.00%
0 / 14
 prepareDocument($document, SVMModel $model)
0.00%
0 / 1
30
0.00%
0 / 20
 setThreshold($threshold)
0.00%
0 / 1
12
0.00%
0 / 16
 getProbabilities($document)
0.00%
0 / 1
6
0.00%
0 / 10
 hasThreshold()
0.00%
0 / 1
2
0.00%
0 / 4
<?php
/**
* This file is part of the Statistical Classifier package.
*
* (c) Cam Spiers <camspiers@gmail.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Camspiers\StatisticalClassifier\Classifier;
use Camspiers\StatisticalClassifier\DataSource\DataSourceInterface;
use Camspiers\StatisticalClassifier\Model\SVMModel;
use Camspiers\StatisticalClassifier\Normalizer\Document;
use Camspiers\StatisticalClassifier\Normalizer\Token;
use Camspiers\StatisticalClassifier\Tokenizer\TokenizerInterface;
use Camspiers\StatisticalClassifier\Tokenizer\Word;
use Camspiers\StatisticalClassifier\Transform;
/**
* Provides a text based SVM classifier which uses libsvm
*
* @author Cam Spiers <camspiers@gmail.com>
* @package Statistical Classifier
*/
class SVM extends Classifier
{
/**
* Tokenizer (the way of breaking up documents)
* @var TokenizerInterface
*/
protected $tokenizer;
/**
* Takes document and makes it consistent
* @var Document\NormalizerInterface
*/
protected $documentNormalizer;
/**
* Takes tokenized data and makes it consistent or stem it
* @var Token\NormalizerInterface
*/
protected $tokenNormalizer;
/**
*
* @var float|bool
*/
protected $threshold;
/**
* @param DataSourceInterface $dataSource
* @param SVMModel $model
* @param Document\NormalizerInterface $documentNormalizer
* @param TokenizerInterface $tokenizer
* @param Token\NormalizerInterface $tokenNormalizer
* @param \SVM $svm
* @param null $threshold
*/
public function __construct(
DataSourceInterface $dataSource,
SVMModel $model = null,
Document\NormalizerInterface $documentNormalizer = null,
TokenizerInterface $tokenizer = null,
Token\NormalizerInterface $tokenNormalizer = null,
\SVM $svm = null,
$threshold = null
) {
$this->dataSource = $dataSource;
$this->model = $model ? : new SVMModel();
$this->documentNormalizer = $documentNormalizer ?: new Document\Lowercase();
$this->tokenizer = $tokenizer ?: new Word();
$this->tokenNormalizer = $tokenNormalizer;
if (!$svm) {
$svm = new \SVM();
$svm->setOptions(
array(
\SVM::OPT_KERNEL_TYPE => \SVM::KERNEL_LINEAR
)
);
}
$this->svm = $svm;
if ($threshold) {
$this->setThreshold($threshold);
}
}
/**
* @inheritdoc
*/
public function prepareModel()
{
$data = $this->applyTransform(
new Transform\TokenPreparation(
$this->tokenizer,
$this->documentNormalizer,
$this->tokenNormalizer
),
$this->dataSource->getData()
);
$tokenCountByDocument = $this->applyTransform(
new Transform\TokenCountByDocument(),
$data
);
$documentLength = $this->applyTransform(
new Transform\DocumentLength(),
$this->applyTransform(
new Transform\TFIDF(),
$tokenCountByDocument,
$this->applyTransform(
new Transform\DocumentCount(),
$data
),
$this->applyTransform(
new Transform\TokenAppearanceCount(),
$tokenCountByDocument
)
)
);
$categoryMap = array();
$categoryCount = 0;
$tokenMap = array();
$tokenCount = 1;
// Produce the token and category maps for the whole document set
foreach ($documentLength as $category => $documents) {
if (!array_key_exists($category, $categoryMap)) {
$categoryMap[$category] = $categoryCount;
$categoryCount++;
}
foreach ($documents as $document) {
foreach (array_keys($document) as $token) {
if (!array_key_exists($token, $tokenMap)) {
$tokenMap[$token] = $tokenCount;
$tokenCount++;
}
}
}
}
// When using probabilities and our dataset is small we need to increase its
// size by duplicating the data
// see: http://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.pdf section "8 Probability Estimates"
if ($this->hasThreshold()) {
foreach ($documentLength as $category => $documents) {
while (count($documents) <= 5) {
foreach ($documents as $document) {
$documents[] = $document;
}
}
$documentLength[$category] = $documents;
}
}
$transform = array();
// Prep the svm data set for use
foreach ($documentLength as $category => $documents) {
foreach ($documents as $document) {
$entry = array(
$categoryMap[$category]
);
foreach ($document as $token => $value) {
$entry[$tokenMap[$token]] = $value;
}
ksort($entry, SORT_NUMERIC);
$transform[] = $entry;
}
}
// Weight the data set by the number of docs that appear in each class.
$weights = array();
foreach ($documentLength as $category => $documents) {
$weights[$categoryMap[$category]] = count($documents);
}
$lowest = min($weights);
foreach ($weights as $index => $weight) {
$weights[$index] = $lowest / $weight;
}
$this->model->setMaps(array_flip($categoryMap), $tokenMap);
$this->model->setModel(
$this->svm->train(
$transform,
$weights
)
);
$this->model->setPrepared(true);
}
/**
* @inheritdoc
*/
public function classify($document)
{
/** @var SVMModel $model */
$model = $this->preparedModel();
$categoryMap = $model->getCategoryMap();
$data = $this->prepareDocument($document, $model);
if ($this->hasThreshold()) {
$probabilities = array();
$category = $model->getModel()->predict_probability($data, $probabilities);
return $probabilities[$category] > $this->threshold ? $categoryMap[$category] : false;
} else {
$category = $model->getModel()->predict($data);
return $categoryMap[$category];
}
}
/**
* Formats the document for use in \SVMModel
* @param string $document
* @param \Camspiers\StatisticalClassifier\Model\SVMModel $model
* @return array
*/
protected function prepareDocument($document, SVMModel $model)
{
$tokenMap = $model->getTokenMap();
$data = array();
if ($this->documentNormalizer) {
$document = $this->documentNormalizer->normalize($document);
}
$tokens = $this->tokenizer->tokenize($document);
if ($this->tokenNormalizer) {
$tokens = $this->tokenNormalizer->normalize($tokens);
}
$tokenCounts = array_count_values($tokens);
foreach ($tokenCounts as $token => $value) {
if (isset($tokenMap[$token])) {
$data[$tokenMap[$token]] = $value;
}
}
ksort($data, SORT_NUMERIC);
return $data;
}
/**
* Set the threshold probability a classifier document must meet
* @param float $threshold float value between 0-1
* @throws \InvalidArgumentException
*/
public function setThreshold($threshold)
{
if (is_numeric($threshold)) {
$this->threshold = $threshold;
$this->svm->setOptions(
array(
\SVM::OPT_PROBABILITY => true
)
);
if ($this->model->isPrepared()) {
$this->model->setPrepared(false);
}
} else {
throw new \InvalidArgumentException("Threshold must be a float value between 0-1");
}
}
/**
* Returns the probabilities of the document being in each category
* @param string $document
* @return array
*/
public function getProbabilities($document)
{
if ($this->hasThreshold()) {
$model = $this->preparedModel();
$data = $this->prepareDocument($document, $model);
$probabilities = array();
$model->getModel()->predict_probability($data, $probabilities);
return array_combine($model->getCategoryMap(), $probabilities);
}
}
/**
* @return bool
*/
protected function hasThreshold()
{
return $this->threshold !== null;
}
}