Florida

This is an extension of Flcelloguy's Tool, which processes the HTML from a contribs file and parses it into a file readable by the tool. It is not finished, but the core is done, so I'm publishing it as v1.00.

It won't run on its own, since the functions written by Flcelloguy (which actually count the contributions) are in a separate file (available at User:Flcelloguy/Tool). All this code does is parse the contributions. Titoxd (?!? - did you read this?) 04:21, 8 December 2005 (UTC)

Revisions

v1.00: Original version, parses contribs to HTML file
v1.01: Revision, split into a separate class, removed print command to system buffer (slowing down tool, only used for debugging)
v2.00: Began processing the raw HTML file; the date/time stamp and page name are now parsed into a special "Contrib" class. Minor edits, edit summaries and most recent edits are still to be implemented.
v2.10: Minor edits implemented, some code for edit summaries created (not operational yet).

Code

PurgeContribs.java

/**
 * HTML -> ContribFile converter for Flcelloguy's Tool
 * @see [[User:Flcelloguy/Tool]]
 * @author Titoxd
 * @version 1.00
 * @docRoot: http://en.wikipedia.org/wiki/User:Titoxd/Flcelloguy's_Tool
 * @copyright: Permission is granted to distribute freely, provided attribution is granted.
 */
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.StringTokenizer;

import javax.swing.JOptionPane;

public class PurgeContribs
{
	public static void main (String args[]) throws IOException
	{
		String inFile$= "";
		inFile$ = JOptionPane.showInputDialog("Input file:", inFile$);
		String outFile$ ="";
		outFile$ = JOptionPane.showInputDialog("Output file:", outFile$);
		Purge(inFile$, outFile$);
	}

	/** 
	 * @param contribsFile (input file in raw HTML)
	 * @param listFile (output file, for the moment in raw HTML, will be modified later to process contribs easier)
	 * @throws IOException 
	 */
		public static void Purge(String contribsFile, String listFile) throws IOException
		{
			FileReader reader = new FileReader(contribsFile);
			BufferedReader in = new BufferedReader(reader);
			
			FileWriter writer = new FileWriter(listFile);
			BufferedWriter out = new BufferedWriter(writer);
			
			String inString ="";
			String outString ="";
			boolean endContribs = false;		//marks whether all the contributions have been parsed
			
			inString = in.readLine();	//read from file and discard
			
			do
			{
				if ((inString.trim().compareTo("<ul>")==0) && (endContribs == false)) 	//until the <ul> tag is reached,
				{
					do
					{
						inString = in.readLine();	//then start reading and recording
						if ((inString.trim().compareTo("</ul>")!=0))
						{
							//System.out.println(inString.trim());
							outString = Parse(inString.trim()).pageName;
							//System.out.println(outString.trim());
							
							out.write(inString.trim(),0,inString.length());
							out.newLine();
						}
						else
						{
							endContribs = true;
						}
					} while (endContribs != true);
				}
				inString = in.readLine();	//read from file and discard
			} while (inString != null);
			in.close();
			out.close();
		}

		public static Contrib Parse(String purgedLine) throws IOException
		{
			/**** Take out the <li> tags ****/
			String midString1;
			String timeStamp;
			String editSummary = null;
			boolean minorEdit = false;
			boolean endLoop = false;
			boolean newestEdit = false;
			midString1 = purgedLine.substring(4,purgedLine.length()-5);
			
			/**** Process the time stamp ****/
			StringTokenizer token;
			token = new StringTokenizer(midString1.trim());
			{
				String time = token.nextToken();
				String day = token.nextToken();
				String month = token.nextToken();
				String year = token.nextToken(); 
				timeStamp = time + " " + day + " " + month + " " + year;
			}
			
			/**** Process the page name ****/
			
			String dummy = token.nextToken();  //get rid of (<a
			String URL = token.nextToken();
			String pageName = URL.substring(25,URL.length()-20);
			
			/**** Get rid of a few extra tokens ****/
			
			do
			{
				endLoop = false;
				dummy = token.nextToken();
				if (dummy.lastIndexOf('<') != -1)
				{
					if (dummy.substring(dummy.lastIndexOf('<'),dummy.lastIndexOf('<')+3).compareTo("</a>") != 0) endLoop = true;
				}
			}
			while (endLoop==false);
			
			/**** Do the same with the diff link ****/
			dummy = token.nextToken();  //get rid of (<a
			String dummyURL = token.nextToken(); 	//this URL is not needed, so it is dummied out
			String dummyPageName = URL.substring(25,dummyURL.length()-20); 	//ditto
			do
			{
				endLoop = false;
				dummy = token.nextToken();
				if (dummy.lastIndexOf('<') != -1)
				{
					if (dummy.substring(dummy.lastIndexOf('<'),dummy.lastIndexOf('<')+3).compareTo("</a>") != 0) endLoop = true;
				}
			}
			while (endLoop==false);
			
			
			/**** Determine if edit is minor or not ****/
			dummy = token.nextToken();  //get rid of (<span
			dummy = token.nextToken();  //read the next token; it should be class="minor">m</span> if a minor edit 
			if (dummy.compareTo("class=\"minor\">m</span>") == 0)
			{
				minorEdit = true;
				dummyPageName = null;
			}
			else
			{
				minorEdit = false;
				dummyPageName = dummy;
			}

			if (dummyPageName == null)	//if it was a minor edit, advance token cursor to match non-minor edits
			{
				dummy = token.nextToken(); //get rid of <a
				dummyPageName = token.nextToken();
			}
			
			do
			{
				endLoop = false;
				dummy = token.nextToken();
				if (dummy.lastIndexOf('<') != -1)
				{
					if (dummy.substring(dummy.lastIndexOf('<'),dummy.lastIndexOf('<')+3).compareTo("</a>") != 0) endLoop = true;
				}
			} while (endLoop==false);
			
			/** The following code to process the edit summaries is commented out, because it doesn't work yet. **/

			/*if (token.hasMoreTokens() == true) 
			{
				dummy = token.nextToken(); 				//read whether it is <span or <strong>
				if (dummy.compareTo("<span") == 0)		//<span: there is an edit summary
				{
					String dummySummary = token.nextToken();
					
				}
				
				//class='comment'>(added link to magnesium hydroxide page)</span>
				//(top)</strong>
			}
			else			//no edit summaries, edit is not the most recent edit to page
			{
				editSummary = null;
				newestEdit = false;
			}
			*/

			Contrib contrib = new Contrib(timeStamp, pageName, minorEdit, null, false);
			return contrib;
	}
}

Contrib.java

/**
 * Contribution class for Flcelloguy's Tool
 * @see [[User:Flcelloguy/Tool]]
 * @author Titoxd
 * @version 1.00
 * @docRoot: http://en.wikipedia.org/wiki/User:Titoxd/Flcelloguy's_Tool
 * @copyright: Permission is granted to distribute freely, provided attribution is granted.
 */

/**
 * One parsed contribution: when it was made, which page it touched, and the
 * flags attached to it. The namespace and the short (namespace-less) page name
 * are derived from the full page name when the object is constructed.
 */
public class Contrib
{
	/**
	 * List of namespaces from [[Wikipedia:Namespace]], written with underscores
	 * instead of spaces. Entry 0 ("Main") is the label used for pages that carry
	 * no recognised namespace prefix.
	 */
	private static final String[] namespaceArray =
		{
		"Main", "Talk", "User", "User_talk",
		"Wikipedia", "Wikipedia_talk", "Image", "Image_talk", "MediaWiki", "MediaWiki_talk",
		"Template", "Template_talk", "Help", "Help_talk", "Category", "Category_talk",
		"Portal", "Portal_talk", "Media", "Special",
		};

	/** Time stamp text exactly as supplied to the constructor. */
	public String timeStamp;
	/** Full page name, including any namespace prefix. */
	public String pageName;
	/** Namespace derived from {@link #pageName}; "Main" when there is no recognised prefix. */
	public String namespace;
	/** Page name without its namespace prefix. */
	public String shortName;
	/** Whether the edit was flagged as minor. */
	public boolean minorEdit;
	/** Edit summary, or null when none was supplied. */
	public String editSummary;
	/** Whether this is the most recent edit to the page. */
	public boolean topEdit;

	/**
	 * Builds a contribution and derives {@link #namespace} and {@link #shortName}
	 * from the page name.
	 *
	 * @param inStamp time stamp text
	 * @param inName full page name; must not be null
	 * @param inMin true if the edit is a minor edit
	 * @param inSummary edit summary, may be null
	 * @param inTop true if the edit is the most recent edit to the page
	 * @throws NullPointerException if inName is null
	 */
	public Contrib(String inStamp, String inName, boolean inMin, String inSummary, boolean inTop)
	{
		timeStamp = inStamp;
		pageName = inName;
		namespace = FindNameSpace(pageName);
		shortName = FindShortName(pageName);
		minorEdit = inMin;
		editSummary= inSummary;
		topEdit = inTop;
	}

	/**
	 * Returns the namespace of a page name: the text before the first colon when
	 * that text is one of the listed namespaces, otherwise "Main".
	 */
	private String FindNameSpace(String inName)
	{
		return namespaceArray[namespaceIndex(inName)];
	}

	/**
	 * Returns the page name with its namespace prefix removed. A colon that does
	 * not follow a listed namespace is part of the title, so the name is returned
	 * unchanged in that case.
	 */
	private String FindShortName(String inName)
	{
		if (namespaceIndex(inName) == 0)
		{
			return inName;
		}
		return inName.substring(inName.indexOf(':') + 1);
	}

	/**
	 * Looks up the prefix before the first colon in the namespace list.
	 *
	 * @return the index of the matching namespace, or 0 (Main) when the name has
	 *         no colon or its prefix is not a listed namespace
	 */
	private static int namespaceIndex(String inName)
	{
		int colon = inName.indexOf(':');
		if (colon < 0)
		{
			return 0;
		}
		String prefix = inName.substring(0, colon);
		//start at 1: entry 0 is only the label for prefix-less pages, not a real prefix
		for (int i = 1; i < namespaceArray.length; i++)
		{
			if (namespaceArray[i].equals(prefix))
			{
				return i;
			}
		}
		return 0;
	}

	/**
	 * Returns a multi-field description of the contribution. Fields are separated
	 * by a bare carriage return ("\r").
	 */
	@Override
	public String toString()
	{
		String returnString = "Time: " + timeStamp + "\r" +
			"Page: " + pageName + " (Namespace: " + namespace + "; Article: " + shortName + ")\r" +
			"Minor edit: " + minorEdit + "\r" +
			"Edit Summary: " + editSummary + "\r" +
			"Most recent edit: " + topEdit;
		return returnString;
	}
}

The best road to progress is freedom's road. - JFK

Florida

Revision as of 02:53, 9 December 2005

Revisions

Code

PurgeContribs.java

Contrib.java