Code Coverage |
||||||||||
Classes and Traits |
Functions and Methods |
Lines |
||||||||
Total | |
0.00% |
0 / 1 |
|
33.33% |
1 / 3 |
CRAP | |
92.68% |
76 / 82 |
ComplementNaiveBayes | |
0.00% |
0 / 1 |
|
33.33% |
1 / 3 |
12.06 | |
92.68% |
76 / 82 |
__construct( DataSourceInterface $dataSource, ModelInterface $model = null, Document\NormalizerInterface $documentNormalizer = null, TokenizerInterface $tokenizer = null, Token\NormalizerInterface $tokenNormalizer = null ) | |
100.00% |
1 / 1 |
4 | |
100.00% |
6 / 6 |
|||
prepareModel() | |
0.00% |
0 / 1 |
1.00 | |
94.23% |
49 / 52 |
|||
classify($document) | |
0.00% |
0 / 1 |
7.10 | |
87.50% |
21 / 24 |
<?php | |
/** | |
* This file is part of the Statistical Classifier package. | |
* | |
* (c) Cam Spiers <camspiers@gmail.com> | |
* | |
* For the full copyright and license information, please view the LICENSE | |
* file that was distributed with this source code. | |
*/ | |
namespace Camspiers\StatisticalClassifier\Classifier; | |
use Camspiers\StatisticalClassifier\DataSource\DataSourceInterface; | |
use Camspiers\StatisticalClassifier\Model\Model; | |
use Camspiers\StatisticalClassifier\Model\ModelInterface; | |
use Camspiers\StatisticalClassifier\Normalizer\Document; | |
use Camspiers\StatisticalClassifier\Normalizer\Token; | |
use Camspiers\StatisticalClassifier\Tokenizer\TokenizerInterface; | |
use Camspiers\StatisticalClassifier\Tokenizer\Word; | |
use Camspiers\StatisticalClassifier\Transform; | |
/**
 * An implementation of a Complement Naive Bayes classifier.
 *
 * This classifier is based off *Tackling the Poor Assumptions of Naive Bayes Text Classifiers* by Jason Rennie
 * @author  Cam Spiers <camspiers@gmail.com>
 * @package Statistical Classifier
 */
class ComplementNaiveBayes extends Classifier
{
    /**
     * Tokenizer (the way of breaking up documents)
     * @var TokenizerInterface
     */
    protected $tokenizer;

    /**
     * Takes a document and makes it consistent
     * @var Document\NormalizerInterface
     */
    protected $documentNormalizer;

    /**
     * Takes tokenized data and makes it consistent or stems it.
     * Stays null when no token normalizer is passed to the constructor.
     * @var Token\NormalizerInterface|null
     */
    protected $tokenNormalizer;

    /**
     * Create the Naive Bayes Classifier
     *
     * Omitted collaborators fall back to defaults: a fresh Model, the Lowercase
     * document normalizer and the Word tokenizer. The token normalizer has no
     * default and is simply left unset when not supplied.
     *
     * @param DataSourceInterface          $dataSource         The source of the training data
     * @param ModelInterface               $model              A model to store data in
     * @param Document\NormalizerInterface $documentNormalizer The normalizer to make documents consistent
     * @param TokenizerInterface           $tokenizer          The tokenizer to break up the documents
     * @param Token\NormalizerInterface    $tokenNormalizer    The normalizer to make tokens consistent
     */
    public function __construct(
        DataSourceInterface $dataSource,
        ModelInterface $model = null,
        Document\NormalizerInterface $documentNormalizer = null,
        TokenizerInterface $tokenizer = null,
        Token\NormalizerInterface $tokenNormalizer = null
    ) {
        $this->dataSource = $dataSource;
        $this->model = $model ?: new Model();
        $this->documentNormalizer = $documentNormalizer ?: new Document\Lowercase();
        $this->tokenizer = $tokenizer ?: new Word();
        $this->tokenNormalizer = $tokenNormalizer;
    }

    /**
     * Builds the model by running the data source's data through the chain of
     * transforms below, stores the final Weight output in the model and marks
     * the model as prepared.
     *
     * Intermediate results are unset as soon as no later transform needs them,
     * so they are not all held in memory at once.
     *
     * @inheritdoc
     */
    public function prepareModel()
    {
        // Tokenize and normalize the raw data from the data source.
        $data = $this->applyTransform(
            new Transform\TokenPreparation(
                $this->tokenizer,
                $this->documentNormalizer,
                $this->tokenNormalizer
            ),
            $this->dataSource->getData()
        );

        $tokenCountByDocument = $this->applyTransform(
            new Transform\TokenCountByDocument(),
            $data
        );

        $documentCount = $this->applyTransform(
            new Transform\DocumentCount(),
            $data
        );

        unset($data);

        $tokenAppearanceCount = $this->applyTransform(
            new Transform\TokenAppearanceCount(),
            $tokenCountByDocument
        );

        $tokensByCategory = $this->applyTransform(
            new Transform\TokensByCategory(),
            $tokenCountByDocument
        );

        $tfidf = $this->applyTransform(
            new Transform\TFIDF(),
            $tokenCountByDocument,
            $documentCount,
            $tokenAppearanceCount
        );

        unset($tokenCountByDocument);
        unset($tokenAppearanceCount);

        $documentLength = $this->applyTransform(
            new Transform\DocumentLength(),
            $tfidf
        );

        unset($tfidf);

        $documentTokenCounts = $this->applyTransform(
            new Transform\DocumentTokenCounts(),
            $documentLength
        );

        $complement = $this->applyTransform(
            new Transform\Complement(),
            $documentLength,
            $tokensByCategory,
            $documentCount,
            $documentTokenCounts
        );

        unset(
            $documentLength,
            $tokensByCategory,
            $documentCount,
            $documentTokenCounts
        );

        // The output of the Weight transform is what classify() reads back
        // through getModel().
        $this->model->setModel(
            $this->applyTransform(
                new Transform\Weight(),
                $complement
            )
        );

        $this->model->setPrepared(true);
    }

    /**
     * Classifies a document against the prepared model.
     *
     * The document is normalized, tokenized and (when a token normalizer is
     * set) token-normalized. Each category is then scored as the sum of
     * `occurrences * weight` over the document tokens that the category has a
     * weight for. The category with the LOWEST score is returned.
     *
     * @inheritdoc
     *
     * @param  mixed $document The document to classify (passed to the document
     *                         normalizer and tokenizer as given)
     * @return mixed The key of the lowest scoring category, or false when the
     *               two lowest scores are equal (this includes a document with
     *               no known tokens when there are several categories) or when
     *               the model has no categories at all
     */
    public function classify($document)
    {
        if ($this->documentNormalizer) {
            $document = $this->documentNormalizer->normalize($document);
        }

        $tokens = $this->tokenizer->tokenize($document);

        if ($this->tokenNormalizer) {
            $tokens = $this->tokenNormalizer->normalize($tokens);
        }

        // Map of token => number of occurrences in the document.
        $tokenCounts = array_count_values($tokens);

        // NOTE(review): preparedModel() is defined outside this class;
        // presumably it prepares the model on demand - confirm in Classifier.
        $weights = $this->preparedModel()->getModel();

        $results = array();

        foreach ($weights as $category => $categoryWeights) {
            // Accumulate as float from the start. The tie check below uses
            // strict identity, and an untouched int 0 would never be identical
            // to a float 0.0 summed for another category.
            $score = 0.0;

            foreach ($tokenCounts as $token => $count) {
                if (array_key_exists($token, $categoryWeights)) {
                    $score += $count * $categoryWeights[$token];
                }
            }

            $results[$category] = $score;
        }

        // Ascending sort: the first entry is the lowest scoring category.
        asort($results, SORT_NUMERIC);

        $category = key($results);
        $lowest = array_shift($results);
        $runnerUp = array_shift($results);

        // A tie between the two best candidates cannot be resolved. With an
        // empty model both values are null, which also yields false.
        if ($lowest === $runnerUp) {
            return false;
        }

        return $category;
    }
}