#!/usr/bin/perl -w use strict; use vars qw($VERSION); require 5.005; $VERSION = sprintf "%d.%d", q$Revision: 1.48 $ =~ /(\d+)/g; # jhi@iki.fi use Getopt::Long; my $n = 1; my ($help, $debug, $input, $latin1, $output, $tables, $version); my $fold = 1; sub help { print <<EOF; Usage: $0 [options] [files] Compute the ngram frequencies and output the word lengths, ngrams, initials ngrams, medial (non-initial non-final) ngrams, and final ngrams. The frequencies will be shown as absolute, as relative to the total frequency, and as relative to the maximum frequency. Options: --n=N The default is one character ngrams. --nofold The default is to lowercase the ngrams. --input=C The default is to use native 8-bit bytes and native definition of "letters". Using this option requires Perl 5.8. The input will be mapped from C to Unicode, and the Unicode Letter definition will be used for "letters". For example to input ISO 8859-1 characters, use -i=latin1. --latin1 Assume the input to be ISO 8859-1. Does not require Perl 5.8. --output=F Output to files F.ngl, F.ngr, F.ngi, F.ngm, F.ngf. The default is to use STDOUT. --tables Instead of a linear list of ngram frequencies output a list of two-dimensional tables of ngram frequencies (for n > 1). --help Show this help. --version Show version. --debug Show the ngrams in STDERR. The options can be shortened to their unique prefixes and the two dashes to one dash. No files means using STDIN. EOF exit(1); } help() unless GetOptions('n=i' => \$n, 'fold!' => \$fold, 'input=s' => \$input, 'latin1' => \$latin1, 'output=s' => \$output, 'tables' => \$tables, 'help' => \$help, 'version' => \$version, 'debug' => \$debug); help() if $n < 1 || int($n) != $n; sub version { print $VERSION, "\n"; exit(1); } help() if $help; version() if $version; require 5.008 if $input; my %length_freq; my %ngram_freq; my %initial_freq; my %medial_freq; my %final_freq; @ARGV = ("-") unless @ARGV; my $letter = $latin1 ? '[A-Za-z��������������������������������������������������������������]' : $] >= 5.008 ? '\pL' : '\w'; for my $fn (@ARGV) { if (open(my $fh, $fn)) { binmode($fh, ":encoding($input)") if $input; while (<$fh>) { chomp; while (/($letter+)/go) { my $s = $1; my $l = length $s; $length_freq{$l}++; if ($l >= $n) { if ($fold) { if ($latin1) { $s =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZ������������������������������/abcdefghijklmnopqrstuvwxyz������������������������������/; } else { $s = lc $s; } } my @n = (); while ($s =~ /(.{$n})/g) { $ngram_freq{$1}++; push @n, $1; pos($s) -= $n - 1; } print STDERR "@n\n" if $debug; $initial_freq{my $ini = shift @n}++; $final_freq {@n ? pop @n : $ini}++; $medial_freq {$_}++ for @n; } } } } else { warn "$0: Failed to open '$fn' for reading: $!\n"; } } sub summax { my $freq = shift; my $max = 0; my $sum = 0; for my $a (keys %{$freq}) { next unless defined $freq->{$a}; $max = $freq->{$a} if $freq->{$a} > $max; $sum += $freq->{$a}; } return ($sum, $max); } sub table_show { my ($out, $freq, $m, $sum, $max, $n, $s) = @_; if ($n == 2) { print $out length $s ? $s : " "; for my $b (@$m) { print $out "\t$b"; } print $out "\n"; for my $a (@$m) { print $out $a; for my $b (@$m) { my $sab = $s . $a . $b; my $f = $freq->{$sab} || 0; printf $out "\t%6d", $f; } print $out "\n"; } } else { for my $a (@$m) { table_show($out, $freq, $m, $sum, $max, $n - 1, $s . $a); } } } sub show { my ($out, $freq, $k, $m, $title) = @_; print $out "$title Frequencies\n"; my ($sum, $max) = summax($freq); if ($tables && defined $m && $n > 1) { table_show($out, $freq, $m, $sum, $max, $n, ''); } else { for my $a (@{ $k }) { my $f = $freq->{$a} || 0; printf $out "%${n}s\t%6d\t%9.6f\t%.6f\n", $a, $f, $sum ? $f / $sum : 0, $max ? $f / $max : 0; } printf $out "%${n}s\t%6d\t%9.6f\n", "Sum", $sum, 1.0; } } my $outputfh = *STDOUT; binmode(STDOUT, ":encoding($input)") if $input && !defined $output; sub output_open { my $suffix = shift; my $outputfn = "$output.ng$suffix"; if (open($outputfh, ">$outputfn")) { binmode($outputfh, ":encoding($input)") if $input; } else { warn "$0: Failed to open '$outputfn' for writing: $!\n"; } } sub freq_show { my ($outputfh, $freq, $k, $m, $suffix, $title) = @_; output_open($suffix) if defined $output; show($outputfh, $freq, $k, $m, $title); } my @l = sort { $a <=> $b } keys %length_freq; freq_show($outputfh, \%length_freq, @l ? [ 1 .. $l[-1] ] : [], undef, 'l', "Word Length"); my @k = sort keys %ngram_freq; my @m; if ($tables && $n > 1) { my %s; @s{ split // } = () for @k; @m = sort keys %s; } freq_show($outputfh, \%ngram_freq, \@k, \@m, 'r', "Ngram"); freq_show($outputfh, \%initial_freq, \@k, \@m, 'i', "Initial Ngram"); freq_show($outputfh, \%medial_freq, \@k, \@m, 'm', "Medial Ngram"); freq_show($outputfh, \%final_freq, \@k, \@m, 'f', "Final Ngram"); exit(0); __END__ =head1 NAME ngram - compute and display frequencies of ngrams =head1 SYNOPIS ngram [--help] [--n=2] [input files] =head1 DESCRIPTION This script computes and display the ngram frequencies in its input. If no ngram length is specified, 1 (one) is assumed. If no files are specified, STDIN is read. =head2 Sample Output With the input C<banana and bandana> and ngram length of one would get the following output. The initial, medial, and final ngrams refer to I<word-initial>, and so forth -- but note that the script doesn't have artificial intelligence built in: C<doesn't> is two words, as is C<inter-continental>. The C<#> are comments added here for the sake of documentation, they are not part of the real output. Word Length Frequencies 1 0 0.000000 0.000000 2 0 0.000000 0.000000 3 1 0.333333 1.000000 # We had one word of length 3. 4 0 0.000000 0.000000 5 0 0.000000 0.000000 6 1 0.333333 1.000000 7 1 0.333333 1.000000 Sum 3 1.000000 # We had three words total. Ngram Frequencies a 7 0.437500 1.000000 # We had seven letters "a". b 2 0.125000 0.285714 d 2 0.125000 0.285714 n 5 0.312500 0.714286 Sum 16 1.000000 # We had sixteen letters total. Initial Ngram Frequencies a 1 0.333333 0.500000 # We had one word beginning with "a". b 2 0.666667 1.000000 # We had two words beginning with "b". d 0 0.000000 0.000000 # We had no words beginning with "d". n 0 0.000000 0.000000 Sum 3 1.000000 Medial Ngram Frequencies a 4 0.400000 0.800000 b 0 0.000000 0.000000 # We had no "b"s in the middle. d 1 0.100000 0.200000 # We only one "d" in the middle. n 5 0.500000 1.000000 Sum 10 1.000000 Final Ngram Frequencies a 2 0.666667 1.000000 b 0 0.000000 0.000000 # We had no "b"s ending words. d 1 0.333333 0.500000 n 0 0.000000 0.000000 Sum 3 1.000000 The sum of initial, medial, and final ngram frequencies equals the number of all ngrams. =head2 Options --n=N The default is to use one character ngrams, in other words, single characters. With the C<-n> option you can change the ngram length. I<N> must be greater than or equal to one. --nofold The default is to lowercase all the letters before counting the frequencies: for example C<Ab> is lowercased to C<ab>. With the C<--nofold> option this lowercasing is not done. --input=C The default is to use the native 8-bit bytes and the native definition of "letters". With the C<--input> option any character set and encoding recognized by the Encode extension can be used as the input. Using this option unfortunately requires at least Perl 5.8.0, but for a common character set see the C<--latin1> option. --latin1 Assume the input to be in ISO 8859-1 (Latin 1). Using this option does not require Perl 5.8. --output=F Output the resulting ngram frequencies to files C<F.ngl>, C<F.ngr>, C<F.ngi>, C<F.ngm>, C<F.ngf> for the lengths, ngrams, initial ngrams, medial ngrams, and final ngrams, respectively. The default is to output all the results to the standard output, and in a different order (lengths, ngrams, initial ngrams, medial ngrams, final ngrams). --tables Relevant only if I<N> is two or more: usually a linear list of the ngram frequencies is output, but with the C<--table> option a list of two-dimensional tables is shown. This may make it easier to visualize the distribution of the ngrams. The relative frequencies and sums are not shown, only the absolute frequencies. --help Show a concise help message. --version Show version of the F<ngram.pl> script. --debug Output various debugging information. Currently shows the detected ngrams to the standard error output. All the options can be shortened to their unique prefixes, and also the leading C<--> be shortened to a single C<->. =head1 PREREQUISITES Getopt::Long strict vars =head1 COREQUISITES Encode =head1 SCRIPT CATEGORIES Text::Statistics =head1 README Compute ngram (one letter, digram, trigram, ...) frequencies. Useful for generating text using Markov chains or for cryptogeeks. =head1 SEE ALSO Simon Cozen's Text::Ngram module in CPAN =head1 COPYRIGHT (C) 2003 by Jarkko Hietaniemi <jhi@iki.fi> All rights reserved. You may distribute this code under the terms of either the GNU General Public License or the Artistic License, as specified in the Perl README file. =cut