package html.analysis;

import html.analysis.*;
import html.node.*;
import javax.swing.tree.*;
import java.util.*;
import java.net.*;

/**
 * Tree walker that collects weighted word statistics, the document title and
 * the targets of anchor tags from a parsed HTML document.
 *
 * Word statistics are kept in {@link #words}, keyed by the normalised word
 * (lower case, apostrophes removed) and valued by a {@link Tuple3}. Words
 * found in the optional {@link #common} table, and words made up only of
 * digits and '-' characters, are not counted.
 *
 * NOTE(review): emphasis state is a single int, so leaving a nested emphasis
 * tag (for example a bold run inside a heading) resets the emphasis weight to
 * zero for the remainder of the enclosing tag. Confirm whether that is wanted.
 */
public class Extractor extends DepthFirstAdapter
{
    /** Number of counted words at the start of a document that earn the "early" bonus. */
    private static final int EARLY_WORD_LIMIT = 200;
    /** Documents with more counted words than this get a doubled "early" bonus. */
    private static final int LONG_DOCUMENT_WORDS = 1000;
    /** Weight added to each word that appears inside the title. */
    private static final int TITLE_WEIGHT = 8;
    /** Weight added to each word inside a short font/b/i/u block. */
    private static final int EMPHASIS_WEIGHT = 5;
    /** Largest number of child blocks for which font/b/i/u still counts as emphasis. */
    private static final int EMPHASIS_MAX_BLOCKS = 5;
    /** Heading weight is this value minus the heading level, never below zero. */
    private static final int HEADING_BASE = 6;

    /** Word to {@link Tuple3} statistics, or null when word extraction is disabled. */
    public Hashtable words = null;
    /** Link targets (Strings) collected from anchor tags, in document order. */
    public ArrayList refs = new ArrayList();
    /** Number of words counted so far (common words and numbers are excluded). */
    public int totalWords = 0;
    /** Sum of the weights of all counted words. */
    public int totalWeight = 0;
    /** Text of the title block; every child of the block is followed by one space. */
    public String title;
    /** Optional table of words to ignore; a word is ignored when it is present as a key. */
    public static Hashtable common;

    /** Current title weight: {@link #TITLE_WEIGHT} while inside the title, otherwise 0. */
    private int inTitle = 0;
    /** Current emphasis weight contributed by heading/font/b/i/u tags. */
    private int inFont = 0;
    /** Scratch buffer used while assembling the title text. */
    private StringBuffer text = new StringBuffer();

    /** Creates an extractor that collects both links and word statistics. */
    public Extractor() {
        words = new Hashtable();
    }

    /**
     * Creates an extractor that always collects links and the title.
     *
     * @param extractWords when false, {@link #words} stays null and no word
     *                     statistics are gathered
     */
    public Extractor(boolean extractWords) {
        if (extractWords) {
            words = new Hashtable();
        }
    }

    /**
     * Tells whether a word consists solely of digits and '-' characters.
     *
     * @param word the word to test
     * @return true when every character is a digit or '-'; this is also true
     *         for the empty string
     */
    static boolean isNumber(String word) {
        for (int i = 0; i < word.length(); i++) {
            char c = word.charAt(i);
            if (!Character.isDigit(c) && c != '-') {
                return false;
            }
        }
        return true;
    }

    /**
     * Normalises a word: converts it to lower case and removes apostrophes.
     *
     * @param word the raw word
     * @return the lower-cased word without any apostrophe characters
     */
    static String stripNasties(String word) {
        // Scan the lower-cased text itself; its length may differ from the
        // original's for some characters, so the bounds must come from it.
        String lower = word.toLowerCase();
        StringBuffer cleaned = new StringBuffer(lower.length());
        for (int i = 0; i < lower.length(); i++) {
            char c = lower.charAt(i);
            if (c != '\'') {
                cleaned.append(c);
            }
        }
        return cleaned.toString();
    }

    /**
     * Removes every apostrophe from the buffer in place. Case is not changed.
     *
     * @param buf the buffer to clean
     * @return the same buffer instance
     */
    static StringBuffer stripNasties(StringBuffer buf) {
        // Walk backwards so deletions never shift characters still to be visited.
        for (int i = buf.length() - 1; i >= 0; i--) {
            if (buf.charAt(i) == '\'') {
                buf.deleteCharAt(i);
            }
        }
        return buf;
    }

    /**
     * Appends the words found beneath a node to the scratch buffer. A mixed
     * block contributes its word; a normal block contributes each of its
     * children in turn, each followed by a single space. Other node kinds
     * contribute nothing.
     *
     * @param node the node whose text is wanted
     */
    void appendContents(Node node) {
        if (node instanceof AMixedBlock) {
            text.append(((AMixedBlock) node).getWord().getText());
        } else if (node instanceof ANormalBlock) {
            Iterator children = ((ANormalBlock) node).getBlock().iterator();
            while (children.hasNext()) {
                appendContents((Node) children.next());
                text.append(" ");
            }
        }
    }

    /**
     * Per-word statistics.
     * first: number of occurrences;
     * second: accumulated weight;
     * third: occurrences among the first counted words of the document.
     */
    public class Tuple3 {
        public int first;
        public int second;
        public int third;

        Tuple3(int i, int j, int k) {
            first = i;
            second = j;
            third = k;
        }

        /** Returns the three values separated by single spaces. */
        public String toString() {
            return first + " " + second + " " + third;
        }
    }

    /**
     * Counts one word occurrence. Does nothing when word extraction is
     * disabled, when the normalised word is a key of {@link #common}, or when
     * {@link #isNumber(String)} accepts it.
     *
     * @param node the block holding the word
     */
    public void caseAMixedBlock(AMixedBlock node) {
        if (words == null) {
            return;
        }
        String word = stripNasties(node.getWord().getText());
        if (common != null && common.get(word) != null) {
            return;
        }
        if (isNumber(word)) {
            return;
        }

        // Exactly the first EARLY_WORD_LIMIT counted words earn the bonus.
        boolean early = totalWords < EARLY_WORD_LIMIT;
        totalWords++;

        // Plain text still counts with weight 1.
        int weight = inTitle + inFont;
        if (weight == 0) {
            weight = 1;
        }
        totalWeight += weight;

        Tuple3 stats = (Tuple3) words.get(word);
        if (stats == null) {
            words.put(word, new Tuple3(1, weight, early ? 1 : 0));
        } else {
            stats.first++;
            stats.second += weight;
            if (early) {
                stats.third++;
            }
        }
    }

    /** Returns the trimmed tag name of a block's opening tag. */
    private String tagName(ANormalBlock node) {
        return ((AOpentag) node.getOpentag()).getTag().toString().trim();
    }

    /** Tells whether the tag name is one of the inline emphasis tags font, b, i or u. */
    private static boolean isInlineEmphasis(String tag) {
        return tag.equalsIgnoreCase("font") || tag.equalsIgnoreCase("b")
            || tag.equalsIgnoreCase("i") || tag.equalsIgnoreCase("u");
    }

    /**
     * Computes the emphasis weight of a heading block as HEADING_BASE minus
     * the heading level, clamped at zero so that out-of-range levels never
     * produce a negative word weight.
     *
     * @return the weight, or 0 when the level is absent or not an integer
     */
    private int headingWeight(ANormalBlock node) {
        // Presumably the number is optional in the grammar (like the tag body) -- TODO confirm.
        Object number = ((AOpentag) node.getOpentag()).getNumber();
        if (number == null) {
            return 0;
        }
        try {
            int level = Integer.parseInt(number.toString().trim());
            return Math.max(0, HEADING_BASE - level);
        } catch (NumberFormatException notANumber) {
            // A malformed level carries no emphasis information.
            return 0;
        }
    }

    /** Counts the direct children of a block. */
    private int blockSize(ANormalBlock node) {
        int count = 0;
        for (Iterator i = node.getBlock().iterator(); i.hasNext(); i.next()) {
            count++;
        }
        return count;
    }

    /** Tells whether the character can be part of an attribute name. */
    private static boolean isAttributeNameChar(char c) {
        return Character.isLetterOrDigit(c) || c == '-' || c == '_' || c == ':';
    }

    /** Returns the index of the first non-whitespace character at or after pos. */
    private static int skipWhitespace(String s, int pos) {
        while (pos < s.length() && Character.isWhitespace(s.charAt(pos))) {
            pos++;
        }
        return pos;
    }

    /**
     * Finds the first href attribute (case-insensitive) in a tag's attribute
     * text and appends its value to {@link #refs}. The value may be enclosed
     * in double or single quotes, or unquoted, in which case it ends at the
     * next whitespace. Nothing is added when there is no href attribute, when
     * it has no value, or when a quoted value is not terminated.
     *
     * @param atts the attribute text of an anchor tag
     */
    private void parseURL(String atts) {
        if (atts == null) {
            return;
        }
        int len = atts.length();
        for (int i = 0; i + 4 <= len; i++) {
            if (!atts.regionMatches(true, i, "href", 0, 4)) {
                continue;
            }
            // Reject names that merely end in "href".
            if (i > 0 && isAttributeNameChar(atts.charAt(i - 1))) {
                continue;
            }
            // Reject names that merely start with "href", such as hreflang.
            int pos = skipWhitespace(atts, i + 4);
            if (pos >= len || atts.charAt(pos) != '=') {
                continue;
            }
            pos = skipWhitespace(atts, pos + 1);
            if (pos >= len) {
                return;
            }
            char first = atts.charAt(pos);
            if (first == '"' || first == '\'') {
                int close = atts.indexOf(first, pos + 1);
                if (close != -1) {
                    refs.add(atts.substring(pos + 1, close));
                }
            } else {
                int end = pos;
                while (end < len && !Character.isWhitespace(atts.charAt(end))) {
                    end++;
                }
                refs.add(atts.substring(pos, end));
            }
            return; // only the first href attribute is considered
        }
    }

    /**
     * Finishes the statistics at the end of the document: each word's weight
     * is raised by its early-occurrence count, or by twice that count when
     * more than LONG_DOCUMENT_WORDS words were counted. Does nothing when
     * word extraction is disabled.
     *
     * @param node the document node
     */
    public void outADoc(ADoc node) {
        if (words == null) {
            return;
        }
        int factor = (totalWords > LONG_DOCUMENT_WORDS) ? 2 : 1;
        Iterator i = words.values().iterator();
        while (i.hasNext()) {
            Tuple3 stats = (Tuple3) i.next();
            stats.second += factor * stats.third;
        }
    }

    /**
     * Reacts to an opening tag: records the link of an anchor, captures the
     * title text and switches on title weighting, or sets the emphasis weight
     * for headings and for short font/b/i/u blocks.
     *
     * @param node the block being entered
     */
    public void inANormalBlock(ANormalBlock node) {
        String tag = tagName(node);
        if (tag.equalsIgnoreCase("a")) {
            TTb tb = ((AOpentag) node.getOpentag()).getTb();
            if (tb != null) {
                parseURL(tb.getText());
            }
        } else if (tag.equalsIgnoreCase("title")) {
            text.setLength(0);
            appendContents(node);
            stripNasties(text);
            title = text.toString();
            inTitle = TITLE_WEIGHT;
        } else if (tag.equalsIgnoreCase("h")) {
            inFont = headingWeight(node);
        } else if (isInlineEmphasis(tag) && blockSize(node) <= EMPHASIS_MAX_BLOCKS) {
            inFont = EMPHASIS_WEIGHT;
        }
    }

    /**
     * Reacts to the end of a block: leaving the title clears the title weight;
     * leaving a heading or a font/b/i/u block clears the emphasis weight.
     *
     * @param node the block being left
     */
    public void outANormalBlock(ANormalBlock node) {
        String tag = tagName(node);
        if (tag.equalsIgnoreCase("title")) {
            inTitle = 0;
        } else if (tag.equalsIgnoreCase("h") || isInlineEmphasis(tag)) {
            inFont = 0;
        }
    }
}

