tokenizer.phps
#!/usr/bin/php
<?php
// Reads every regular file located directly inside the directory given as the
// first command-line argument (sub-directories are not descended into), splits
// each file's contents into words on a fixed set of delimiter characters, and
// prints every unique word once, one per line, in first-seen order.
//
// Usage: tokenizer.php <directory>

if (!isset($argv[1]) || $argv[1] === '') {
    file_put_contents('php://stderr', "Usage: " . basename(__FILE__) . " <directory>\n");
    exit(1);
}

// Normalise to exactly one trailing separator so that $dir . $file is a valid
// path whether or not the caller typed a trailing slash.
$dir = rtrim($argv[1], '/' . DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR;

// Whole files are loaded into memory, so allow for large inputs.
ini_set("memory_limit", "600M");

// Every character in this string is treated as a word delimiter by strtok().
$tokenlist = " \\/.\"#',?=&!:-|<>()[]@~\n";

$handle = opendir($dir);
if ($handle === false) {
    file_put_contents('php://stderr', "Cannot open directory: " . $dir . "\n");
    exit(1);
}

// Map of word => number of occurrences. Initialised up front so an empty
// directory yields empty output rather than an error in the final loop.
$keys = array();

while (false !== ($file = readdir($handle))) {
    $path = $dir . $file;

    // Skips ".", "..", sub-directories and anything else that is not a file.
    if (!is_file($path)) {
        continue;
    }

    $str = file_get_contents($path);
    if ($str === false) {
        // Unreadable file: report it and carry on with the rest.
        file_put_contents('php://stderr', "Skipping unreadable file: " . $path . "\n");
        continue;
    }

    // The first strtok() call binds the string; later calls continue from
    // where the previous one stopped. Compare against false strictly so a
    // token such as "0" does not end the loop early.
    $token = strtok($str, $tokenlist);
    while ($token !== false) {
        if (isset($keys[$token])) {
            $keys[$token]++;
        } else {
            $keys[$token] = 1;
        }
        $token = strtok($tokenlist);
    }
}
closedir($handle);

foreach (array_keys($keys) as $key) {
    echo $key . "\n";
}
?>