#!/usr/local/bin/perl
# ========================================================================
# dict2db - convert text dictionary to BerkeleyDB hash
# Andrew Ho (andrew@zeuscat.com)
#
# This program contains embedded documentation in Perl POD (Plain Old
# Documentation) format. Search for the string "=head1" in this document
# to find documentation snippets, or use "perldoc" to read it; utilities
# like "pod2man" and "pod2html" can reformat as well.
#
# $Id: dict2db,v 1.1 2010/04/02 05:57:25 andrew Exp $
# ========================================================================

=head1 NAME

dict2db - convert text dictionary to BerkeleyDB hash

=head1 SYNOPSIS

    % dict2db [-h] [-v] [-a] [-u|-l] [-p] inputdict outputdb

=head1 DESCRIPTION

This program reads words from a dictionary, which should have one word
per line, as in the typical Unix F</usr/dict/words> or
F</usr/share/dict/words> file, and creates a BerkeleyDB whose keys are
the words from the input file, and whose values are a true value (1).

In addition, the following options are available:

=over 4

=item -h

Display a usage message and exit.

=item -v

Enable verbose mode. Print each entry added to the dictionary.

=item -a

Alphabetic characters only. When writing a BerkeleyDB entry, strip
non-alphabetic characters from the words from the input dictionary.
The default is to not do any processing.

=item -u, -l

Normalize all BerkeleyDB keys to uppercase or lowercase. The default
is to leave lettercase alone.

=item -p

Create prefix entries. For each word, also create an entry for each of
the word's prefixes, with a false value (0). For example, the word APPLE
in the input dictionary would result in the following entries in the
output BerkeleyDB:

    A     => 0,
    AP    => 0,
    APP   => 0,
    APPL  => 0,
    APPLE => 1,

The default is to not generate prefix entries.

=back

This program can be used to create a BerkeleyDB file for the Scramble
solver program (see L<scramble_solver>):

    % dict2db -a -u -p dictionary.txt dictionary.db
    % scramble_solver -d dictionary.db "abcd efgh ijkl mnop"

=head1 COPYRIGHTS

This code is copyright (C) 2010 Andrew Ho, commercial rights reserved.

=head1 AUTHOR

Andrew Ho E<lt>F<andrew@zeuscat.com>E<gt>

=cut

# ------------------------------------------------------------------------
# Libraries, globals, and constants

require 5.6.0;
use warnings;
use strict;
$| = 1;

use File::Basename qw(basename);
use Getopt::Long qw(GetOptions);
use BerkeleyDB qw(DB_CREATE);

# User configurable options
our $Verbose    = 0;    # -v: report each entry added on stderr
our $Prefix     = 0;    # -p: also store every proper prefix with value 0
our $Alpha_Only = 0;    # -a: strip everything that is not an English letter
our $Uppercase  = 0;    # -u: normalize keys to uppercase
our $Lowercase  = 0;    # -l: normalize keys to lowercase
our $Input_Dict;        # path of the input word list
our $Output_DB;         # path of the output BerkeleyDB file

# Constants for command line help
our $ME    = basename $0;
our $USAGE = "usage: $ME [-h] [-v] [-a] [-u|-l] [-p] inputdict outputdb\n";
our $FULL_USAGE = $USAGE . <<"EndUsage";
    -h         display this help text and exit
    -v         verbose output, show extra output on stderr
    -a         strip characters that are not English letters
    -u         normalize letter case to uppercase
    -l         normalize letter case to lowercase
    -p         add entries for prefixes as well as full words
    inputdict  required input dictionary file
    outputdb   required output BerkeleyDB file
EndUsage

# ------------------------------------------------------------------------
# Parse command line options, initialize word map from dictionary

{
    # Promote contextless Getopt::Long warnings to nicely formatted die()s
    local $SIG{__WARN__} = sub {
        my $errmsg = lcfirst join '', @_;
        chomp $errmsg;
        die "$ME: argument parsing error: $errmsg\n$USAGE";
    };

    # The single-letter forms are declared as explicit aliases so that the
    # documented -h/-v/-a/-u/-l/-p switches do not depend on Getopt::Long's
    # automatic abbreviation of the long names.
    my $help;
    GetOptions(
        'help|h'       => \$help,
        'verbose|v'    => \$Verbose,
        'alphabetic|a' => \$Alpha_Only,
        'uppercase|u'  => \$Uppercase,
        'lowercase|l'  => \$Lowercase,
        'prefix|p'     => \$Prefix,
    );
    if ($help) {
        print $FULL_USAGE;
        exit 0;
    }
}

die "$ME: must only supply one of -u or -l arguments\n$USAGE"
    if $Uppercase && $Lowercase;

($Input_Dict, $Output_DB) = @ARGV;
warn "$ME: ignoring extra arguments\n" if @ARGV > 2;
die "$ME: missing required input dictionary filename argument\n$USAGE"
    if !defined($Input_Dict) || $Input_Dict eq '';
die "$ME: missing required output BerkeleyDB filename argument\n$USAGE"
    if !defined($Output_DB) || $Output_DB eq '';

# ------------------------------------------------------------------------
# Main loop

open my $fh, '<', $Input_Dict
    or die "$ME: could not open $Input_Dict: $!\n";

# The fallback chain must be parenthesized: "." binds tighter than "||",
# so without the parentheses the message string itself (always true) would
# be the left operand and the fallbacks would never be consulted.
my %out;
tie(%out, 'BerkeleyDB::Hash',
    -Filename => $Output_DB,
    -Flags    => DB_CREATE)
    or die "$ME: could not open $Output_DB for writing: "
         . ($BerkeleyDB::Error || $! || 'unknown error') . "\n";

LINE:
while (my $line = <$fh>) {
    # Trim leading whitespace, drop "#" comments, trim trailing whitespace
    # (which also removes the line terminator); skip lines left blank.
    $line =~ s/^\s+//;
    $line =~ s/#.*$//;
    $line =~ s/\s+$//;
    next LINE unless $line =~ /\S/;

    if ($Alpha_Only) {
        # Keep English letters only. \W would let digits and "_" through,
        # which contradicts the documented behavior of -a.
        $line =~ s/[^A-Za-z]+//g;
        next LINE if $line eq '';    # nothing left to store
    }

    my $word = $Uppercase ? uc($line)
             : $Lowercase ? lc($line)
             :              $line;

    # A full word is always stored as true, even if it was previously
    # recorded only as a prefix (0) of some longer word.
    $out{$word} = 1;
    print STDERR "$word => 1\n" if $Verbose;

    next LINE unless $Prefix;

    # Record each proper prefix, longest first, as false; never downgrade
    # an entry that already exists (it may be a real word).
    for my $len (reverse 1 .. length($word) - 1) {
        my $prefix = substr($word, 0, $len);
        next if exists $out{$prefix};
        $out{$prefix} = 0;
        print STDERR "$prefix => 0\n" if $Verbose;
    }
}
close $fh;

# Release the database explicitly instead of relying on global destruction
# at exit to write out and close it.
untie %out;

exit 0;


# ========================================================================
__END__