Content deleted Content added
→PurgeContribs.java: update to 2.1: minor edits |
m →Contrib.java: upgrade: parses namespaces |
||
(One intermediate revision by the same user not shown) | |||
Line 7: | Line 7: | ||
*'''''[http://en.wikipedia.org/w/index.php?title=User:Titoxd/Flcelloguy%27s_Tool&diff=next&oldid=30556816 v1.01]''''': Revision, split into a separate class, removed print command to system buffer (slowing down tool, only used for debugging) |
*'''''[http://en.wikipedia.org/w/index.php?title=User:Titoxd/Flcelloguy%27s_Tool&diff=next&oldid=30556816 v1.01]''''': Revision, split into a separate class, removed print command to system buffer (slowing down tool, only used for debugging) |
||
*'''''[http://en.wikipedia.org/w/index.php?title=User:Titoxd/Flcelloguy%27s_Tool&diff=30570216&oldid=30558690 v2.00]''''': begun processing the raw HTML file, parsed date/time stamp and page name into a special "<tt>Contrib</tt>" class. Minor edits, edit summaries and most recent edits still to be implemented. |
*'''''[http://en.wikipedia.org/w/index.php?title=User:Titoxd/Flcelloguy%27s_Tool&diff=30570216&oldid=30558690 v2.00]''''': begun processing the raw HTML file, parsed date/time stamp and page name into a special "<tt>Contrib</tt>" class. Minor edits, edit summaries and most recent edits still to be implemented. |
||
*'''''[http://en.wikipedia.org/w/index.php?title=User:Titoxd/Flcelloguy%27s_Tool&diff=30655953&oldid=30570248 v2.10]''''': Minor edits implemented, some code for edit summaries created (not operational yet). |
|||
== Code == |
== Code == |
||
Line 217: | Line 218: | ||
timeStamp = inStamp; |
timeStamp = inStamp; |
||
pageName = inName; |
pageName = inName; |
||
namespace = FindNameSpace(pageName); |
|||
shortName = FindShortName(pageName); |
|||
minorEdit = inMin; |
minorEdit = inMin; |
||
editSummary= inSummary; |
editSummary= inSummary; |
||
Line 222: | Line 225: | ||
} |
} |
||
private String FindNameSpace(String inName) |
|||
{ |
|||
String[] nameArray=inName.split(":",2); |
|||
if (nameArray[0].compareTo(inName)==0) |
|||
{ |
|||
nameArray[0] = namespaceArray[0]; |
|||
} |
|||
return nameArray[0]; |
|||
} |
|||
private String FindShortName(String inName) |
|||
{ |
|||
String name=null; |
|||
if (inName.contains(":")) |
|||
{ |
|||
String[] nameArray=inName.split(":",2); |
|||
name = nameArray[1]; |
|||
} |
|||
else |
|||
{ |
|||
name = inName; |
|||
} |
|||
return name; |
|||
} |
|||
public String timeStamp; |
public String timeStamp; |
||
public String pageName; |
public String pageName; |
||
public String namespace; |
|||
public String shortName; |
|||
public boolean minorEdit; |
public boolean minorEdit; |
||
public String editSummary; |
public String editSummary; |
||
public boolean topEdit; |
public boolean topEdit; |
||
private static String[] namespaceArray = //list of namespaces from [[Wikipedia:Namespace]] |
|||
{ |
|||
"Main", "Talk", "User", "User_talk", |
|||
"Wikipedia", "Wikipedia_talk", "Image", "Image_talk", "MediaWiki", "MediaWiki_talk", |
|||
"Template", "Template_talk", "Help", "Help_talk", "Category", "Category_talk", |
|||
"Portal", "Portal_talk", "Media", "Special", |
|||
}; |
|||
public String toString() |
public String toString() |
||
{ |
{ |
||
String returnString = "Time: " + timeStamp + "\r" + |
String returnString = "Time: " + timeStamp + "\r" + |
||
"Page: " + pageName + "\r" + |
"Page: " + pageName + " (Namespace: " + namespace + "; Article: " + shortName + ")\r" + |
||
"Minor edit: " + minorEdit + "\r" + |
"Minor edit: " + minorEdit + "\r" + |
||
"Edit Summary: " + editSummary + "\r" + |
"Edit Summary: " + editSummary + "\r" + |
Revision as of 02:53, 9 December 2005
This is an extension of Flcelloguy's Tool, which processes the HTML from a contribs file and parses it into a file readable by the tool. It is not finished, but the core is done, so I'm publishing it as v1.00.
It won't run, since the functions written by Flcelloguy (which actually count the contributions) are in a separate file (available at User:Flcelloguy/Tool). All it does is parse the contributions. Titoxd(?!? - did you read this?) 04:21, 8 December 2005 (UTC)
Revisions
- v1.00: Original version, parses contribs to HTML file
- v1.01: Revision, split into a separate class, removed print command to system buffer (slowing down tool, only used for debugging)
- v2.00: Began processing the raw HTML file; the date/time stamp and page name are now parsed into a special "Contrib" class. Minor edits, edit summaries and most recent edits are still to be implemented.
- v2.10: Minor edits implemented, some code for edit summaries created (not operational yet).
Code
PurgeContribs.java
/** * HTML -> ContribFile converter for Flcelloguy's Tool * @see [[User:Flcelloguy/Tool]] * @author Titoxd * @version 1.00 * @docRoot: http://en.wikipedia.org/wiki/User:Titoxd/Flcelloguy's_Tool * @copyright: Permission is granted to distribute freely, provided attribution is granted. */ import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.StringTokenizer; import javax.swing.JOptionPane; public class PurgeContribs { public static void main (String args[]) throws IOException { String inFile$= ""; inFile$ = JOptionPane.showInputDialog("Input file:", inFile$); String outFile$ =""; outFile$ = JOptionPane.showInputDialog("Output file:", outFile$); Purge(inFile$, outFile$); } /** * @param contribsFile (input file in raw HTML) * @param listFile (output file, for the moment in raw HTML, will be modified later to process contribs easier) * @throws IOException */ public static void Purge(String contribsFile, String listFile) throws IOException { FileReader reader = new FileReader(contribsFile); BufferedReader in = new BufferedReader(reader); FileWriter writer = new FileWriter(listFile); BufferedWriter out = new BufferedWriter(writer); String inString =""; String outString =""; boolean endContribs = false; //marks whether all the contributions have been parsed inString = in.readLine(); //read from file and discard do { if ((inString.trim().compareTo("<ul>")==0) && (endContribs == false)) //until the <ul> tag is reached, { do { inString = in.readLine(); //then start reading and recording if ((inString.trim().compareTo("</ul>")!=0)) { //System.out.println(inString.trim()); outString = Parse(inString.trim()).pageName; //System.out.println(outString.trim()); out.write(inString.trim(),0,inString.length()); out.newLine(); } else { endContribs = true; } } while (endContribs != true); } inString = in.readLine(); //read from file and discard } while (inString != null); 
in.close(); out.close(); } public static Contrib Parse(String purgedLine) throws IOException { /**** Take out the <li> tags ****/ String midString1; String timeStamp; String editSummary = null; boolean minorEdit = false; boolean endLoop = false; boolean newestEdit = false; midString1 = purgedLine.substring(4,purgedLine.length()-5); /**** Process the time stamp ****/ StringTokenizer token; token = new StringTokenizer(midString1.trim()); { String time = token.nextToken(); String day = token.nextToken(); String month = token.nextToken(); String year = token.nextToken(); timeStamp = time + " " + day + " " + month + " " + year; } /**** Process the page name ****/ String dummy = token.nextToken(); //get rid of (<a String URL = token.nextToken(); String pageName = URL.substring(25,URL.length()-20); /**** Get rid of a few extra tokens ****/ do { endLoop = false; dummy = token.nextToken(); if (dummy.lastIndexOf('<') != -1) { if (dummy.substring(dummy.lastIndexOf('<'),dummy.lastIndexOf('<')+3).compareTo("</a>") != 0) endLoop = true; } } while (endLoop==false); /**** Do the same with the diff link ****/ dummy = token.nextToken(); //get rid of (<a String dummyURL = token.nextToken(); //this URL is not needed, so it is dummied out String dummyPageName = URL.substring(25,dummyURL.length()-20); //ditto do { endLoop = false; dummy = token.nextToken(); if (dummy.lastIndexOf('<') != -1) { if (dummy.substring(dummy.lastIndexOf('<'),dummy.lastIndexOf('<')+3).compareTo("</a>") != 0) endLoop = true; } } while (endLoop==false); /**** Determine if edit is minor or not ****/ dummy = token.nextToken(); //get rid of (<span dummy = token.nextToken(); //read the next token; it should be class="minor">m</span> if a minor edit if (dummy.compareTo("class=\"minor\">m</span>") == 0) { minorEdit = true; dummyPageName = null; } else { minorEdit = false; dummyPageName = dummy; } if (dummyPageName == null) //if it was a minor edit, advance token cursor to match non-minor edits { dummy = 
token.nextToken(); //get rid of <a dummyPageName = token.nextToken(); } do { endLoop = false; dummy = token.nextToken(); if (dummy.lastIndexOf('<') != -1) { if (dummy.substring(dummy.lastIndexOf('<'),dummy.lastIndexOf('<')+3).compareTo("</a>") != 0) endLoop = true; } } while (endLoop==false); /** The following code to process the edit summaries is commented out, because it doesn't work yet. **/ /*if (token.hasMoreTokens() == true) { dummy = token.nextToken(); //read whether it is <span or <strong> if (dummy.compareTo("<span") == 0) //<span: there is an edit summary { String dummySummary = token.nextToken(); } //class='comment'>(added link to magnesium hydroxide page)</span> //(top)</strong> } else //no edit summaries, edit is not the most recent edit to page { editSummary = null; newestEdit = false; } */ Contrib contrib = new Contrib(timeStamp, pageName, minorEdit, null, false); return contrib; } }
Contrib.java
/**
 * Contribution class for Flcelloguy's Tool
 * @see [[User:Flcelloguy/Tool]]
 * @author Titoxd
 * @version 1.00
 * @docRoot: http://en.wikipedia.org/wiki/User:Titoxd/Flcelloguy's_Tool
 * @copyright: Permission is granted to distribute freely, provided attribution is granted.
 */
public class Contrib
{
    /**
     * Creates a contribution record and derives the namespace and the short
     * page name from the full page name.
     *
     * @param inStamp   time stamp of the edit
     * @param inName    full page name, optionally prefixed with "Namespace:"; must not be null
     * @param inMin     true if the edit was marked as minor
     * @param inSummary edit summary, may be null
     * @param inTop     true if this is the most recent edit to the page
     */
    public Contrib(String inStamp, String inName, boolean inMin, String inSummary, boolean inTop)
    {
        timeStamp = inStamp;
        pageName = inName;
        namespace = FindNameSpace(pageName);
        shortName = FindShortName(pageName);
        minorEdit = inMin;
        editSummary = inSummary;
        topEdit = inTop;
    }

    /**
     * Returns the namespace of a page name. Only a prefix listed in
     * namespaceArray counts as a namespace; any other name, including an
     * article title that merely contains a colon, belongs to the first
     * entry of that array.
     *
     * @param inName full page name
     * @return the namespace prefix, or namespaceArray[0] if there is none
     */
    private static String FindNameSpace(String inName)
    {
        int colon = namespaceSeparator(inName);
        if (colon == -1)
        {
            return namespaceArray[0];
        }
        return inName.substring(0, colon);
    }

    /**
     * Returns the page name without its namespace prefix. Names that do not
     * start with a listed namespace are returned unchanged.
     *
     * @param inName full page name
     * @return the part after "Namespace:", or the whole name
     */
    private static String FindShortName(String inName)
    {
        int colon = namespaceSeparator(inName);
        if (colon == -1)
        {
            return inName;
        }
        return inName.substring(colon + 1);
    }

    /**
     * Finds the colon that separates a listed namespace from the page title.
     *
     * @param inName full page name
     * @return index of that colon, or -1 if the text before the first colon
     *         is not a listed namespace (or there is no colon at all)
     */
    private static int namespaceSeparator(String inName)
    {
        int colon = inName.indexOf(':');
        if (colon <= 0)
        {
            return -1;
        }

        String prefix = inName.substring(0, colon);
        // Index 0 is the default namespace, which is never written as a
        // prefix, so the search starts at index 1.
        for (int i = 1; i < namespaceArray.length; i++)
        {
            if (namespaceArray[i].equals(prefix))
            {
                return colon;
            }
        }
        return -1;
    }

    public String timeStamp;
    public String pageName;
    public String namespace;
    public String shortName;
    public boolean minorEdit;
    public String editSummary;
    public boolean topEdit;

    private static final String[] namespaceArray = //list of namespaces from [[Wikipedia:Namespace]]
    {
        "Main", "Talk", "User", "User_talk",
        "Wikipedia", "Wikipedia_talk", "Image", "Image_talk", "MediaWiki", "MediaWiki_talk",
        "Template", "Template_talk", "Help", "Help_talk", "Category", "Category_talk",
        "Portal", "Portal_talk", "Media", "Special",
    };

    /**
     * Returns a multi-line description of the contribution. The lines are
     * separated by a carriage return ("\r").
     *
     * @return the time stamp, page, minor flag, summary and top-edit flag as text
     */
    @Override
    public String toString()
    {
        String returnString = "Time: " + timeStamp + "\r" +
            "Page: " + pageName + " (Namespace: " + namespace + "; Article: " + shortName + ")\r" +
            "Minor edit: " + minorEdit + "\r" +
            "Edit Summary: " + editSummary + "\r" +
            "Most recent edit: " + topEdit;
        return returnString;
    }
}