/*	Words

PIRL CVS ID: Words.java,v 1.24 2012/04/16 06:15:36 castalia Exp

Copyright (C) 2003-2012  Arizona Board of Regents on behalf of the
Planetary Image Research Laboratory, Lunar and Planetary Laboratory at
the University of Arizona.

This file is part of the PIRL Java Packages.

The PIRL Java Packages are free software; you can redistribute them
and/or modify them under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.

The PIRL Java Packages are distributed in the hope that they will be
useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

*******************************************************************************/

package PIRL.Strings;

import java.util.Vector;
import java.lang.IndexOutOfBoundsException;

/**	The <I>Words</I> class provides a mechanism to treat a String as a
	sequence of delimited words.
<p>
@see		String_Buffer
@author		Bradford Castalia, UA/PIRL
@version	1.24
*/
public class Words
{
/**	Class identification: name, version and date.
*/
public static final String
	ID = "PIRL.Strings.Words (1.24 2012/04/16 06:15:36)";

/**	The index in the current string where the current word starts.
<p>
	Initially, this is the beginning (0) of the words string. When
	{@link #Next_Location() Next_Location} finds no more words this is
	set, along with the {@link #End_Index}, to the end of the characters
	string.
*/
public int
	Start_Index				= 0;

/**	The index (exclusive) in the current string where the current word ends.
<p>
	This will be zero if no {@link #Next_Word() Next_Word} has been
	selected. When there are no more words this will be equal to the
	{@link #Start_Index}.
*/
public int
	End_Index				= 0;

/**	The Word_Index of the last {@link #Mark() marked} word.
*/
public Word_Index
	Mark_Index				= new Word_Index ();

/**	The default word delimiters.
<p>
	The usual whitespace characters: " \n\r\t".
*/
public static final String
	DEFAULT_DELIMITERS		= " \n\r\t";

/**	The default word {@link #Mask(String) mask}.
*/
public static final String
	DEFAULT_MASK			= "*******";

//	The delimiter set and mask string currently in effect.
private String
	Delimiters				= DEFAULT_DELIMITERS,
	Word_Mask  				= DEFAULT_MASK;

//	The characters that are scanned for words.
private String_Buffer
	Characters				= new String_Buffer ();

/**	Whether or not to treat quoted strings as a word.
<p>
	This is the default applied to a Words object when it is constructed.
*/
public static boolean
	QUOTED_WORDS			= true;
private boolean
	Quoted_Words			= QUOTED_WORDS;

/**	Whether or not to delimit words at quotes.
<p>
	This is the default applied to a Words object when it is constructed.
*/
public static boolean
	DELIMIT_AT_QUOTE		= true;
private boolean
	Delimit_at_Quote		= DELIMIT_AT_QUOTE;

/**	Whether or not to treat parenthesized strings as a word.
<p>
	This is the default applied to a Words object when it is constructed.
*/
public static boolean
	PARENTHESIZED_WORDS		= false;
private boolean
	Parenthesized_Words		= PARENTHESIZED_WORDS;


//  DEBUG control.
private static final int
	DEBUG_OFF			= 0,
	DEBUG_CONSTRUCTORS	= 1 << 0,
	DEBUG_ACCESSORS		= 1 << 1,
	DEBUG_METHODS		= 1 << 2,
	DEBUG_ALL			= -1,

	DEBUG       		= DEBUG_OFF;

/*==============================================================================
	Constructors
*/
/**	Constructs Words from a String of characters.
<p>
	@param	characters	The String of characters containing words.
*/
public Words
	(
	String	characters
	)
{Characters = new String_Buffer (characters);}

/**	Constructs Words with no characters.
<p>
	@see	#Characters(String)
*/
public Words ()
{}

/*==============================================================================
	Accessors
*/
/**	Gets the Words characters.
<p>
	@return	The current String of characters.
*/
@Override
public String toString ()
{return Characters.toString ();}

/**	Sets the String of characters.
<p>
	The current location is reset to the beginning of the string.
	The {@link #Mark_Index} is reset to (0,0).
<p>
	@param	characters	The String of characters.
	@return	This Words object.
	@see	#Location(int)
*/
public Words Characters
	(
	String	characters
	)
{
Characters = new String_Buffer (characters);
Mark_Index = new Word_Index ();
return Location (0);
}

/**	Sets the word delimiter characters.
<p>
	A word is delimited by a contiguous sequence of characters that are
	all members of the delimiters characters. Note that a sequence of
	more than one of the same or different characters from the delimiters
	set does not result in empty words; i.e. any contiguous sequence of
	one or more delimiters is treated as a single word delimiter.
<p>
	<b>N.B.</b>: Any character starting a special sequence should not
	be included as one of the delimiter characters. If they are then
	special sequence recognition will be effectively disabled.
<p>
	@param	delimiters	The String of delimiter characters. If null,
		the {@link #DEFAULT_DELIMITERS} will be used.
	@return	This Words object.
	@see	#Quoted_Words(boolean)
	@see	#Parenthesized_Words(boolean)
*/
public Words Delimiters
	(
	String	delimiters
	)
{
Delimiters = (delimiters == null) ? DEFAULT_DELIMITERS : delimiters;
return this;
}

/**	Gets the current delimiters.
<p>
	@return	The String of delimiter characters.
*/
public String Delimiters ()
{return Delimiters;}

/**	Enable or disable the treatment of quoted strings as words.
<p>
	The enclosing quote characters are included in the word. If there is
	no matching unescaped quote character before the end of the string,
	the resulting word will not have the matching closing quote character
	at {@link #End_Index} - 1.
<p>
	@param	enable	true if all characters within unescaped quotes
		(' or ") are to be treated as a single word; false otherwise.
	@return	This Words object.
	@see	#Delimit_at_Quote(boolean)
	@see	#Next_Location()
*/
public Words Quoted_Words
	(
	boolean	enable
	)
{Quoted_Words = enable; return this;}

/**	Test if quoted strings are treated as single words.
<p>
	@return	true if all characters within unescaped quotes (' or ") will
		be treated as a single word; false otherwise.
	@see	#Quoted_Words(boolean)
*/
public boolean Quoted_Words ()
{return Quoted_Words;}

/**	Enable or disable delimiting words at quotes.
<p>
	When unescaped quote characters are encountered they may delimit a
	word even if no {@link #Delimiters(String) delimiter character}
	precedes or follows the quote. Disabling quote delimiting causes
	contiguous non-delimiter characters to be included as part of the
	quoted string word. The quotes remain in the word in either case.
<p>
	@param	enable	true if unescaped quote characters delimit a word;
		false otherwise.
	@return	This Words object.
	@see	#Quoted_Words(boolean)
*/
public Words Delimit_at_Quote
	(
	boolean	enable
	)
{Delimit_at_Quote = enable; return this;}

/**	Test if quotes will delimit words.
<p>
	@return	true if unescaped quote characters delimit a word;
		false otherwise.
	@see	#Delimit_at_Quote(boolean)
*/
public boolean Delimit_at_Quote ()
{return Delimit_at_Quote;}

/**	Enable or disable the treatment of parenthesized strings as words.
<p>
	<b>N.B.</b>: Nested parenthesized strings are included in a
	parenthesized string.
<p>
	The enclosing parentheses characters are included in the word. If
	there is no matching unescaped closing parenthesis character
	(ignoring nested parentheses) before the end of the string, the
	resulting word will not have one at {@link #End_Index} - 1.
<p>
	@param	enable	true if all characters within unescaped parenthesized
		 ('(' and ')') strings are to be treated as a single word; false
		 otherwise.
	@return	This Words object.
	@see	#Next_Location()
*/
public Words Parenthesized_Words
	(
	boolean	enable
	)
{Parenthesized_Words = enable; return this;}

/**	Test if parenthesized strings are treated as single words.
<p>
	@return	true if all characters within unescaped parenthesized strings
		will be treated as a single word; false otherwise.
	@see	#Parenthesized_Words(boolean)
*/
public boolean Parenthesized_Words ()
{return Parenthesized_Words;}

/**	Gets a substring of the characters.
<p>
	@param	start	The start index of the substring.
	@param	end		The end index of the substring.
	@return	The substring from the start index up to, but not including,
		the end index.
	@see	StringBuffer#substring(int, int)
*/
public String Substring
	(
	int		start,
	int		end
	)
{return Characters.substring (start, end);}

/**	Gets a substring of the characters.
<p>
	@param	start	The start index of the substring.
	@return	The substring from the start index to the end of the
		characters string.
	@see	StringBuffer#substring(int)
*/
public String Substring
	(
	int		start
	)
{return Characters.substring (start);}

/**	Gets a substring of the characters.
<p>
	@param	word_index	A Word_Index for the substring.
	@return	The substring from the word_index.Start_Index up to, but not
		including, the word_index.End_Index.
	@see	StringBuffer#substring(int, int)
*/
public String Substring
	(
	Word_Index	word_index
	)
{return Characters.substring (word_index.Start_Index, word_index.End_Index);}

/*==============================================================================
	Word_Index
*/
/**	A <i>Word_Index</i> provides a start,end string location for a word.
*/
public class Word_Index
{
/**	A string index where a word starts.
*/
public int
	Start_Index;

/**	A string index (exclusive) where a word ends.
*/
public int
	End_Index;

/**	Constructs a Word_Index at (0,0).
*/
public Word_Index ()
{
Start_Index =
End_Index   = 0;
}

/**	Constructs a Word_Index for start and end word locations.
<p>
	@param	start	The index where the word starts.
	@param	end		The index (exclusive) where the word ends.
*/
public Word_Index
	(
	int		start,
	int		end
	)
{
Start_Index = start;
End_Index   = end;
}
}

/*==============================================================================
	Manipulators
*/
/**	Moves the word indices to a new location.
<p>
	The location must be within the words string; the end of the string
	(a location equal to its length) is accepted. Both the {@link
	#Start_Index} and the {@link #End_Index} are set to the location.
<p>
	@param	location	An index in the words string.
	@return	This Words object.
	@throws	IndexOutOfBoundsException	If the location is not
		within the words string.
*/
public Words Location
	(
	int		location
	)
	throws IndexOutOfBoundsException
{
if (location < 0 || location > Characters.length ())
	throw new IndexOutOfBoundsException (ID + '\n' +
		"Location " + location + " is outside the words string length of " +
		Characters.length ());
Start_Index = End_Index = location;
return this;
}

/**	Marks the current word location.
<p>
	The current {@link #Start_Index} and {@link #End_Index} are stored
	in the {@link #Mark_Index}.
<p>
	@return	This Words object.
	@see	#Restore()
*/
public Words Mark ()
{
Mark_Index = new Word_Index (Start_Index, End_Index);
return this;
}

/**	Restores the current word to the last marked location.
<p>
	@return	This Words object.
	@throws	StringIndexOutOfBoundsException	If the Word_Index.Start_Index
		is less than zero or the Word_Index.End_Index is greater than the
		number of characters available.
	@see	#Mark()
*/
public Words Restore ()
{
if (Mark_Index.Start_Index < 0 ||
	Mark_Index.End_Index > Characters.length ())
	throw new StringIndexOutOfBoundsException (ID + '\n'
		+ "Can't restore the invalid Mark_Index ("
			+ Mark_Index.Start_Index + ", " + Mark_Index.End_Index + ")");
Start_Index = Mark_Index.Start_Index;
End_Index   = Mark_Index.End_Index;
return this;
}

/**	Moves the word indices to the location of the next word.
<p>
	Beginning at the current {@link #End_Index} all {@link
	#Delimiters(String) delimiter characters} are skipped to find the new
	{@link #Start_Index}. If the end of the characters string is reached
	without finding a non-delimiter character then there are no more
	words available. In this case the End_Index is set equal to the
	Start_Index, at the end of the characters string, and nothing more
	will be done.
<p>
	<b>N.B.</b>: Any character starting a special sequence should not
	be included as one of the delimiter characters. If they are then
	special sequence recognition will be effectively disabled.
<p>
	The character at the Start_Index is checked to see if it starts a
	special sequence. If {@link #Quoted_Words(boolean) quoted words} is
	enabled either a single (') or double (") quote character will be
	recognized and set as the end of sequence marker character. If {@link
	#Parenthesized_Words(boolean) parenthesized words} is enabled an
	opening parenthesis ('(') character will be recognized and the end of
	sequence marker character will be set to the closing parenthesis
	(')') character. A special sequence start character is included as
	part of the word.
<p>
	When {@link #Delimit_at_Quote(boolean) delimit at quote} is enabled
	in addition to quoted words being enabled quoted strings are
	delimited as separate words even if a contiguous non-delimiter
	character precedes and/or follows the enclosing quotes. When delimit
	at quote is disabled a quoted string that is encountered inside a
	word, together with the contiguous non-delimiter characters around
	it, is treated as part of that word.
<p>
	The word contains all characters up to and including an unescaped end
	of sequence marker character. For a parenthesized sequence the marker
	character must be at parenthesis level zero to end the sequence;
	unescaped nested parentheses increase the parenthesis level.
	Parentheses inside a quoted sequence are ordinary characters; they
	never affect where the quoted sequence ends. Note that a special
	sequence may include what would otherwise be considered delimiter
	characters, and the enclosing characters - quotes or parentheses -
	are included as part of the word. If the end of the characters string
	is reached before the expected marker character is found the
	resulting word will be "unbalanced"; the character at End_Index - 1
	will not be the marker character.
<p>
	If no end of sequence marker character has been set, then the word
	will end when any unescaped delimiter character is found or the end
	of the characters string is reached. If quoted words and delimit at
	quote are both enabled a quote character will be recognized as a
	delimiter character. If parenthesized words are enabled an opening
	parenthesis character will be recognized as a delimiter character.
	The index of the delimiter character becomes the new End_Index; it is
	not included as part of the word.
<p>
	Any character preceded by a backslash ('\') character is escaped from
	any special treatment. All escaped characters are taken to be part of
	the word, the backslash character included.
<p>
	@return	A Word_Index for the next word. If there are no more words
		the word index will be set to the end of the characters.
*/
public Word_Index Next_Location ()
{
if ((DEBUG & DEBUG_METHODS) != 0)
	System.out.println (">>> Words.Next_Location");
int
	length = Characters.length ();
if ((Start_Index = Characters.skip_over (End_Index, Delimiters)) < length)
	{
	//	A non-delimiter character following the current word end index.
	if ((DEBUG & DEBUG_METHODS) != 0)
		System.out.println
			("    Start_Index = " + Start_Index);
	//	Initialize the new end-of-word index to the start index.
	End_Index = Start_Index;

	//	Check for the start of a special sequence.
	int
		level = 0;	//	Parentheses nesting level.
	char
		marker = 0,	//	Special sequence end marker character; 0 if none.
		character = Characters.charAt (Start_Index);
	if (Quoted_Words &&
			(character == '"' ||
			 character == '\''))
		{
		if ((DEBUG & DEBUG_METHODS) != 0)
			System.out.println
				("    Quoted with " + character);
		marker = character;
		++End_Index;
		}
	else
	if (Parenthesized_Words &&
			character == '(')
		{
		if ((DEBUG & DEBUG_METHODS) != 0)
			System.out.println
				("    Parenthesized");
		marker = ')';
		++level;
		++End_Index;
		}

	//	Search for the next end-of-word marker.
	Search:
	while (End_Index < length)
		{
		character = Characters.charAt (End_Index);
		if ((DEBUG & DEBUG_METHODS) != 0)
			System.out.println
				("    " + End_Index + ": '" + character + '\'');
		if (character == '\\')
			{
			/*	Skip the escaped character: step onto it here and let
				the word character increment below step past it.
			*/
			if (++End_Index == length)
				//	Trailing backslash; nothing follows to escape.
				break;
			}
		else if (marker == 0)
			{
			//	No special sequence identification in effect.
			if (Delimiters.indexOf (character) >= 0)
				//	Word delimiter found.
				break;

			if (Quoted_Words &&
					(character == '"' ||
					 character == '\''))
				{
				//	Start of a contiguous quoted sequence.
				if (Delimit_at_Quote)
					break;
				else
					{
					//	Search for the end of the sequence.
					char
						quote = character;
					while (++End_Index < length)
						{
						character = Characters.charAt (End_Index);
						if ((DEBUG & DEBUG_METHODS) != 0)
							System.out.println
								("    " + End_Index + ": '" + character + '\'');
						if (character == quote)
							{
							//	Include the closing quote in the word.
							++End_Index;
							break;
							}
						else
						if (character == '\\')
							{
							//	Skip the escaped character.
							if (++End_Index == length)
								break Search;
							}
						}
					/*	End_Index is already positioned after the quoted
						sequence (or at the end of the string); resume the
						outer search without the word character increment.
					*/
					continue;
					}
				}

			if (Parenthesized_Words &&
					character == '(')
				//	Start of a contiguous parenthesized sequence.
				break;
			}
		else if (character == marker)
			{
			++End_Index;	//	Always include the marker in the word.
			if (level != 0 &&
				--level != 0)
				//	End of nested parenthesized sequence.
				continue;
			//	End of special sequence.
			break;
			}
		else if (marker == ')' &&
				 character == '(')
			/*	Start of nested parenthesized sequence.

				Nesting is only counted while a parenthesized sequence is
				being scanned. An opening parenthesis inside a quoted
				sequence is an ordinary character; counting it there would
				prevent the closing quote from ending the word.
			*/
			++level;

		//	Word character.
		++End_Index;
		}
	}
else
	//	No more words following the current word.
	End_Index = Start_Index;
if ((DEBUG & DEBUG_METHODS) != 0)
	System.out.println
		("<<< Words.Next_Location: " + Start_Index + ", " + End_Index);
return new Word_Index (Start_Index, End_Index);
}

/**	Gets the next word.
<p>
	The word indices are moved to the {@link #Next_Location() next
	location} and the characters they delimit are returned.
<p>
	@return The substring from the next Start_Index up to, but not including,
		the next End_Index. If there are no more words, the empty String
		will be returned.
	@see	#Next_Location()
*/
public String Next_Word ()
{return Substring (Next_Location ());}

/**	Splits the remaining characters into words.
<p>
	Beginning with the {@link #Next_Word() next word}, words are
	collected into a Vector in the order they occur in the string.
<p>
	If the limit is 0 all available words will be returned; no delimiters
	will be included in any word that is returned. If the limit is
	positive (> 0) no more than limit words will be returned; the last
	"word" will contain all characters, including any delimiters,
	following the start of the last word (delimiters preceding the last
	word will not be included). A negative limit acts the same as a
	positive limit except the last "word" will contain all characters
	following the end of the previous word (delimiters preceding the
	last word will be included). Note that a limit of -1 will return all
	characters from the current End_Index to the end of the
	characters string.
<p>
	Less than limit words may be returned. No empty words will be
	returned.
<p>
	@param	limit	The word limit to return.
	@return	A Vector of zero or more words.
	@see	#Next_Location()
*/
public Vector<String> Split
	(
	int		limit
	)
{
if ((DEBUG & DEBUG_METHODS) != 0)
	System.out.println (">>> Words.Split: " + limit);
Vector<String>
	words = new Vector<String> ();
int
	total = (limit < 0) ? -limit : limit;
/*	Whether the remainder of the string is still to be taken as the last
	"word". This must not be inferred from the word indices: when the
	limit is 1 or -1 the collection loop never runs and the indices still
	reflect whatever was selected before this method was called.
*/
boolean
	more_words = true;
//	Collect all but the last of the limited words (all words if no limit).
while (total-- != 1)
	{
	Next_Location ();
	if (Start_Index < End_Index)
		words.add (Substring (Start_Index, End_Index));
	else
		{
		//	No more words.
		more_words = false;
		break;
		}
	}
if (more_words)
	{
	if (limit > 0)
		{
		//	Advance to the next word; the remainder starts there.
		Next_Location ();
		if (Start_Index < End_Index)
			words.add (Substring (Start_Index));
		}
	else if (limit < 0 &&
			 End_Index < Characters.length ())
		//	The remainder starts where the previous word ended.
		words.add (Substring (End_Index));
	}
if ((DEBUG & DEBUG_METHODS) != 0)
	System.out.println ("<<< Words.Split: " + words);
return words;
}

/**	Splits the remaining characters into words.
<p>
	Beginning with the {@link #Next_Word() next word}, words are
	collected into a Vector in the order they occur in the string.
<p>
	@return	A Vector of zero or more words.
	@see	#Split(int)
*/
public Vector<String> Split ()
{return Split (0);}

/**	Sets the mask to use when words are masked.
<p>
	@param	mask	The mask String. This may be null.
	@return	This Words object.
	@see	#Mask(Vector)
*/
public Words Mask
	(
	String	mask
	)
{Word_Mask = mask; return this;}

/**	Gets the word mask.
<p>
	@return	The String to be used when masking words.
	@see	#Mask(Vector)
*/
public String Mask ()
{return Word_Mask;}

/**	Words preceded by any one of a set of names are masked.
<p>
	Beginning with the {@link #Next_Word() next word}, the words are
	searched for matches with the names. When a match is found, the
	following word is replaced with the mask String. If the mask String
	is null the preceding name as well as its word is deleted. A name
	that is the last word has no following word to replace; with a
	non-null mask the characters are then left unchanged.
<p>
	<B>N.B.</B>: The mask string may be one of the names. The mask
	substitution is never compared against the names list.
<p>
	@param	names	A Vector of names to find. If null, no names will
		match.
	@return	This Words object.
	@see	#Mask(String)
	@see	#Next_Word()
*/
public Words Mask
	(
	Vector<String>	names
	)
{
if ((DEBUG & DEBUG_METHODS) != 0)
	System.out.println (">>> Words.Mask:\n" + names + '\n' + Characters);
int
	mask_length = (Word_Mask == null) ? 0 : Word_Mask.length ();
String
	word;
while ((word = Next_Word ()).length () != 0)
	{
	if ((DEBUG & DEBUG_METHODS) != 0)
		System.out.println ("    + Start_Index = " + Start_Index
			+ ", End_Index = " + End_Index);
	if (names != null &&
		names.contains (word))
		{
		if ((DEBUG & DEBUG_METHODS) != 0)
			System.out.println ("    Found: " + word);
		int
			//	Where the name starts, for dropping it along with its word.
			name_start = Start_Index;
		//	Move to the word that follows the name.
		Next_Location ();
		if ((DEBUG & DEBUG_METHODS) != 0)
			System.out.println ("    Next_Word: "
				+ Characters.substring (Start_Index, End_Index));
		if (Word_Mask == null)
			{
			//	Drop the name along with its word.
			Characters.delete (name_start, End_Index);
			End_Index = name_start;
			}
		else if (Start_Index != End_Index)
			{
			//	Substitute the mask for the word.
			Characters.replace (Start_Index, End_Index, Word_Mask);
			//	Adjust the End_Index for the mask replacement.
			End_Index = Start_Index + mask_length;
			}
		/*	Otherwise no word followed the name so nothing was replaced;
			the indices are left at the end of the characters rather
			than being advanced by the length of a mask that was never
			inserted.
		*/
		//	Advance over the replacement section.
		Start_Index = End_Index;
		if ((DEBUG & DEBUG_METHODS) != 0)
			System.out.println ("    - Start_Index = " + Start_Index
				+ ", End_Index = " + End_Index);
		}
	}
if ((DEBUG & DEBUG_METHODS) != 0)
	System.out.println ("<<< Words.Mask:\n" + Characters);
return this;
}


}	//	End of Words class.
