chunkster/perl
From twext
814 describes how one chunkster solution might work
below is perl code that once chunked text:
#!/usr/bin/perl
#****h* TwEXT_WWW/chunkster.pl [0.9] *
# NAME
# chunkster.pl
# COPYRIGHT
# (c) 1996-2000 TwEXT EDitions
# FUNCTION
# Chunks a text by applying the POST, MID, PRE and EXCEPTIONS grammatical rules.
# NOTES
# This script is included in edit.pl script which is used to create or
# fix twext editions.
# AUTHOR
# Urivan Saaib <saaib@c-ber.net>
# SEE ALSO
# login.pl, edit.pl, prefs.pl
# CREATION DATE
# 17-Jun-1999
#***
#****f* chunkster.pl/Chunkster
# NAME
# Chunkster
# SYNOPSIS
# &Chunkster(*TwEXT,$User)
# FUNCTION
# Chunks a text by applying the POST, MID, PRE and EXCEPTIONS grammatical rules.
# INPUTS
# $TwEXT -- Text to be chunked (passed as a typeglob, *TwEXT)
# $User -- Nickname of the user whose grammatical rules file is used
# RETURN VALUE
# $TwEXT -- The chunked text (the caller's variable is modified in place)
# SOURCE
#
sub Chunkster
{
    # Chunks $TwEXT in place: inserts line breaks around the words and
    # punctuation listed in a per-language rules file.
    #
    # Arguments (both optional; the package globals are used when omitted):
    #   *TwEXT -- typeglob whose scalar slot holds the text to chunk
    #   $User  -- user nickname, used to locate a personal rules file
    #
    # Reads the package globals $PREF_CHUNK, $ROOT_DIR, $MLANG and
    # $FORM{chunk}.  Dies if the selected rules file cannot be opened.
    # Returns nothing when chunking is disabled, otherwise the chunked text.
    #
    # Rules file format: one "name=item|item|..." line for each of
    # sentences, mid, post, pre and exceptions.  Items are treated as
    # literal text, never as regular expressions.

    # Alias the caller's glob so that $TwEXT below IS the caller's scalar.
    local(*TwEXT,$User)=@_ if @_;

    # System-wide rule sets selectable through $PREF_CHUNK.
    # NOTE(review): "5" selects the same file as "3" (SYSTEM_MEDIUM), exactly
    # as the original code did -- confirm whether that is intentional.
    my %system_rules = (
        '2'     => 'SYSTEM_NARROW',
        '3'     => 'SYSTEM_MEDIUM',
        '4'     => 'SYSTEM_WIDE',
        '5'     => 'SYSTEM_MEDIUM',
        'lyric' => 'SYSTEM_LYRIC',
    );

    my $rules_file;
    if ($PREF_CHUNK eq "1") {
        return;                                 # manual chunking
    }
    elsif (exists $system_rules{$PREF_CHUNK}) {
        $rules_file = "$ROOT_DIR/pref/$system_rules{$PREF_CHUNK}.chk_$MLANG";
    }
    elsif ($PREF_CHUNK eq "0") {
        # Prefer the user's own rules; fall back to the general system set.
        $rules_file = "$ROOT_DIR/pref/$User.chk_$MLANG";
        $rules_file = "$ROOT_DIR/pref/SYSTEM_GENERAL.chk_$MLANG"
            unless -r $rules_file;
    }
    elsif ($FORM{chunk} eq "00") {
        return;
    }

    # An unrecognised preference was fatal before too (open of an empty
    # name failed); now the message says why.
    die "Chunkster: no rules file for chunk preference '$PREF_CHUNK'\n"
        unless defined $rules_file;

    # Three-argument open: the file name can never be read as a mode.
    open(my $rules_fh, '<', $rules_file)
        or die "Unable to open file $rules_file: $!\n";

    # Forget rules left over from a previous call so that a file lacking a
    # section does not silently inherit the previous file's section.
    delete @CHK_WORDS{qw(sentences mid post pre exceptions)};

    while (my $line = <$rules_fh>) {
        $line =~ s/[\r\n]+$//;      # strip LF or CRLF; keep a last line intact
        next unless $line =~ /=/;   # skip blank or malformed lines
        # Limit of 2 so that a value may itself contain "=".
        my ($title, $words) = split(/=/, $line, 2);
        $CHK_WORDS{$title} = $words;
    }
    close($rules_fh);

    # Split a "a|b|c" rule into its items.  Empty items are dropped because
    # an empty pattern would make s/// reuse the last successful pattern.
    my $items_of = sub {
        my ($list) = @_;
        return () unless defined $list;
        return grep { length($_) } split(/\|/, $list);
    };

    # The parsed rules stay in package globals, as before, in case other
    # scripts read them after the call.
    @SentenceChunk = $items_of->($CHK_WORDS{sentences});
    @MidChunk      = $items_of->($CHK_WORDS{mid});
    @PostChunk     = $items_of->($CHK_WORDS{post});
    @PreChunk      = $items_of->($CHK_WORDS{pre});
    @ExceptChunk   = $items_of->($CHK_WORDS{exceptions});

    # Hide the existing line structure behind markers so that only the
    # breaks inserted by the rules below are plain newlines.
    $TwEXT =~ s/\r\n/\n/g;                  # DOS to UNIX
    $TwEXT =~ s/\n\n\n+/<PARAGRAPH>/g;      # mark paragraphs
    $TwEXT =~ s/\n\n+/<SENTENCE>/g;         # mark sentences
    $TwEXT =~ s/\n/<ENTER>/g;               # mark single new lines

    # Protect every occurrence of each exception behind a numbered
    # placeholder (<1>, <2>, ...) so the chunk rules cannot split it.
    my $number = 0;
    foreach my $exception (@ExceptChunk) {
        $number++;
        $TwEXT =~ s/\Q$exception\E/<${number}>/g;
    }

    # Sentences: a blank line after every sentence terminator.
    foreach my $item (@SentenceChunk) {
        $TwEXT =~ s/\Q$item\E/$item\n\n/g;
    }

    # POST chunks: break after the item.  A single non-space character
    # must be followed by a space, and that space is consumed.
    foreach my $item (@PostChunk) {
        my $pattern = quotemeta($item);
        $pattern .= ' ' if length($item) == 1 && $item ne ' ';
        $TwEXT =~ s/$pattern/$item\n/g;
    }

    # MID chunks: the item followed by a space; the space becomes a break.
    foreach my $item (@MidChunk) {
        $TwEXT =~ s/\Q$item\E /$item\n/g;
    }

    # PRE chunks: the item between spaces; the leading space becomes a break.
    foreach my $item (@PreChunk) {
        $TwEXT =~ s/ \Q$item\E /\n$item /g;
    }

    # Put the protected exception texts back.
    $number = 0;
    foreach my $exception (@ExceptChunk) {
        $number++;
        $TwEXT =~ s/<${number}>/$exception/g;
    }

    # Restore paragraphs, sentences and single new lines.
    $TwEXT =~ s/<PARAGRAPH>/\n\n\n/g;
    $TwEXT =~ s/<SENTENCE>/\n\n/g;
    $TwEXT =~ s/<ENTER>/\n/g;
    $TwEXT =~ s/\n /\n/g;                   # no leading space on a new line

    return $TwEXT;
}
#***
1;

