chunkster/perl

From twext

Jump to: navigation, search

814 describes how one chunkster solution might work

Below is the Perl code that was once used to chunk text:

#!/usr/bin/perl

#****h* TwEXT_WWW/chunkster.pl [0.9] *
# NAME
#   chunkster.pl
# COPYRIGHT
#   (c) 1996-2000 TwEXT EDitions
# FUNCTION
#   Chunks a text applying the POST, MID, PRE and EXCEPTIONS grammatical rules.
# NOTES
#   This script is included in edit.pl script which is used to create or
#   fix twext editions.
# AUTHOR
#   Urivan Saaib <saaib@c-ber.net>
# SEE ALSO
#   login.pl, edit.pl, prefs.pl
# CREATION DATE
#   17-Jun-1999
#*** 

#****f* chunkster.pl/Chunkster
# NAME
#   Chunkster
# SYNOPSIS
#   &Chunkster(*TwEXT,$User)
# FUNCTION
#   Chunks a text applying the POST, MID, PRE and EXCEPTIONS grammatical rules
# INPUTS
#   $TwEXT    --  Text to apply the chunks method
#   $User     --  User nickname where to get the grammatical rules
# RETURN VALUE
#   $TwEXT    --  The text chunked
# SOURCE
#
sub Chunkster 
{
  # Chunk the text held in $TwEXT in place, using the sentence, POST, MID,
  # PRE and exception word lists read from a rules file under $ROOT_DIR/pref.
  #
  # Arguments : *TwEXT -- glob whose scalar slot holds the text (edited in place)
  #             $User  -- nickname used to look up a personal rules file
  # Globals   : reads $PREF_CHUNK, $ROOT_DIR, $MLANG and $FORM{chunk};
  #             fills %CHK_WORDS, %Exceptions_Array and the @...Chunk arrays.
  # Dies      : when no rules file is selected or the file cannot be opened.
  #
  # When called without arguments the current global $TwEXT/$User are used.
  local(*TwEXT,$User)=@_ if @_;
  my ($Pref_Folder,$Item_Number,$Chunk_Item);

  return if $PREF_CHUNK eq "1";            # manual chunking

  # System-wide rule sets selected directly by the preference value.
  # NOTE(review): "5" and "3" both select SYSTEM_MEDIUM, exactly as before;
  # confirm whether "5" was meant to name a different rule set.
  my %System_Rules = (
    "5"     => "SYSTEM_MEDIUM",
    "4"     => "SYSTEM_WIDE",
    "3"     => "SYSTEM_MEDIUM",
    "2"     => "SYSTEM_NARROW",
    "lyric" => "SYSTEM_LYRIC",
  );

  if (exists $System_Rules{$PREF_CHUNK}) {
     $Pref_Folder="$ROOT_DIR/pref/$System_Rules{$PREF_CHUNK}.chk_$MLANG";
  }
  elsif ($PREF_CHUNK eq "0") {
     # Personal rules when readable, otherwise the general system rules.
     if ( -r "$ROOT_DIR/pref/$User.chk_$MLANG" ) {
        $Pref_Folder="$ROOT_DIR/pref/$User.chk_$MLANG";
     } else { $Pref_Folder="$ROOT_DIR/pref/SYSTEM_GENERAL.chk_$MLANG"; } }
  elsif ($FORM{chunk} eq "00") { return; }

  die "Unable to open file: no chunk rules selected for PREF_CHUNK '$PREF_CHUNK'.\n"
    unless defined $Pref_Folder;

  # Three-argument open with a lexical handle: the path is built from $User
  # and $MLANG, so it must never be parsed for mode or pipe characters.
  open (my $Rules_Handle,'<',$Pref_Folder)
    or die "Unable to open file $Pref_Folder: $!\n";
  while (my $Line = <$Rules_Handle>)
  {
    # Strip only the line terminator (LF or CRLF); a last line without a
    # trailing newline keeps its final character.
    $Line =~ s/\r?\n\z//;
    next if $Line eq "";
    # Limit 2 so a rule value may itself contain "=".
    ($title,$words)= split(/=/,$Line,2);
    $CHK_WORDS{$title} = $words;
  }
  close($Rules_Handle);

  # Split each word sequence into an array
  @SentenceChunk = split(/\|/,$CHK_WORDS{sentences});
  @MidChunk = split(/\|/,$CHK_WORDS{mid});
  @PostChunk = split(/\|/,$CHK_WORDS{post});
  @PreChunk = split(/\|/,$CHK_WORDS{pre});
  @ExceptChunk = split(/\|/,$CHK_WORDS{exceptions});

   $TwEXT =~ s/\r\n/\n/g;   # DOS 2 UNIX
   $TwEXT =~ s/\n\n\n+/\<PARAGRAPH\>/g; # mark paragraphs
   $TwEXT =~ s/\n\n+/\<SENTENCE\>/g;    # mark sentences
   $TwEXT =~ s/\n/\<ENTER\>/g;          # mark single new lines

   # Every list item below is matched as literal text (quotemeta): the
   # replacement side re-inserts the item verbatim, so treating it as a
   # regex would rewrite whatever the pattern happened to match.
   # Empty items are skipped: an empty pattern makes Perl reuse the last
   # successfully matched regex instead of matching nothing.

   # Hide every occurrence of each exception behind a numbered marker so
   # the chunk rules cannot split it.
   $Item_Number = 0;
   foreach my $Exception (@ExceptChunk)
   {
      $Item_Number++;
      $Exceptions_Array{$Item_Number} = $Exception;
      next if $Exception eq "";
      my $Marker = "<$Item_Number>";
      $TwEXT =~ s/\Q$Exception\E/$Marker/g;
   }

   # mark sentences: blank line after each sentence terminator
   foreach my $Item (@SentenceChunk)
   {
      next if $Item eq "";
      $Chunk_Item = quotemeta($Item);
      $TwEXT =~ s/$Chunk_Item/$Item\n\n/g;
   }

   # do POSTChunks: break after the item; a single non-space character
   # must be followed by a space, which the line break replaces
   $flag=0;
   foreach my $Item (@PostChunk)
   {
      next if $Item eq "";
      $Chunk_Item = quotemeta($Item);
      $Chunk_Item .= " " if length($Item) == 1 && $Item ne " ";
      $TwEXT =~ s/$Chunk_Item/$Item\n/g;
   }

   # do MIDChunks: the space after the item becomes a line break
   foreach my $Item (@MidChunk)
   {
      next if $Item eq "";
      $Chunk_Item = quotemeta($Item) . " ";
      $TwEXT =~ s/$Chunk_Item/$Item\n/g;
   }

   # do PREChunks: the space before the item becomes a line break
   foreach my $Item (@PreChunk)
   {
      next if $Item eq "";
      $Chunk_Item = quotemeta($Item) . " ";
      $TwEXT =~ s/ $Chunk_Item/\n$Item /g;
   }

   # Restore the exception texts from their numbered markers
   $Item_Number = 0;
   foreach my $Exception (@ExceptChunk)
   {
      $Item_Number++;
      next if $Exception eq "";
      my $Marker = "<$Item_Number>";
      $TwEXT =~ s/\Q$Marker\E/$Exception/g;
   }

   # restore paragraphs, sentences and single enters
   $TwEXT =~ s/\<PARAGRAPH\>/\n\n\n/g;
   $TwEXT =~ s/\<SENTENCE\>/\n\n/g;
   $TwEXT =~ s/\<ENTER\>/\n/g;

   # drop the space left at the start of a line by the breaks above
   $TwEXT =~ s/\n /\n/g;
}
#***
1;
Personal tools