// Source page: User:MER-C/Spamsearch.java
/**
* @(#)Spamsearch.java 0.02 23/10/2007
* Copyright (C) 2007 MER-C
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.concurrent.*;
import java.util.logging.*;
import javax.swing.*;
import java.util.regex.*;
/**
 *  Searches all Wikimedia wikis for spam. Usage: <tt>java Spamsearch
 *  example.com example.net ...</tt>, where example.com and example.net are the
 *  sites spammed. If no arguments are given, the sites are requested through
 *  an input dialog. Outputs the results to a text file in the current
 *  directory (i.e. <tt>results.txt</tt>).
 *
 *  Requires Wiki.java 0.11.
 *
 *  Exit codes: 0 on completion (or interruption), 1 if no sites were supplied,
 *  2 on any other failure.
 *
 *  NOTE: multi-site search used to be listed as a known issue. Completion of
 *  each site is now tracked with a latch that every worker counts down, and
 *  the hit counter is reset per site, which removes the races that could
 *  stall or miscount later sites. This has not been verified against the
 *  live wikis.
 *
 *  @author MER-C
 *  @version 0.02
 */
public class Spamsearch
{
    /**
     *  Private wikis have the API disabled and must be skipped. Current
     *  private wikis are anything containing "com." or ".en." and
     *  (board|chair|exec|grants|internal|office|otrs-wiki|tlh|wikimaniateam).wikimedia.org.
     */
    private static final Pattern PRIVATE_WIKI = Pattern.compile(
        "(com\\.|\\.en\\.|board|chair|exec|grants|internal|office|otrs|tlh|wikimaniateam)");

    /**
     *  Extracts the host name from a <tt>url="..."</tt> attribute of the
     *  site matrix, regardless of scheme (http, https or protocol-relative)
     *  and of whether the URL carries a trailing slash.
     */
    private static final Pattern SITE_URL = Pattern.compile("url=\"(?:https?:)?//([^\"/]+)");

    /** Number of worker threads started before the launcher pauses. */
    private static final int BATCH_SIZE = 16;

    /** Pause between batches of worker threads, in milliseconds. */
    private static final long BATCH_PAUSE_MILLIS = 8500;

    private final List<Wiki> wikis = new ArrayList<Wiki>(1333);
    private PrintWriter out; // output file; also the lock guarding the file and hits
    private ProgressMonitor monitor; // progress monitor of the site being searched
    private CountDownLatch pending; // counts workers still running for the current site
    private int progress = 0; // only touched on the event dispatch thread
    private int hits = 0; // number of links found; guarded by out

    public static void main(String[] args) throws IOException
    {
        new Spamsearch(args);
    }

    /**
     *  Runs the whole search and terminates the JVM when done.
     *  @param args the sites to search for; if empty, the user is prompted
     */
    private Spamsearch(String[] args)
    {
        // check if command line arguments were specified
        if (args.length == 0)
        {
            String sites = JOptionPane.showInputDialog(null, "Enter sites to search");
            // showInputDialog returns null when the dialog is cancelled
            if (sites == null || sites.trim().length() == 0)
            {
                System.err.println("No sites specified.");
                System.exit(1);
            }
            // trim and split on whitespace runs so no empty site names are produced
            args = sites.trim().split("\\s+");
        }

        try
        {
            // various initialisation
            out = new PrintWriter(new FileWriter("results.txt"));
            out.println("Starting spamsearch at " + new Date() + ".");

            // suppress log records below INFO
            Logger.getLogger("wiki").setLevel(Level.INFO);

            // fetch site matrix
            Logger.getLogger("wiki").info("Fetching site matrix.");
            List<String> domains = fetchDomains();
            for (int i = 0; i < domains.size(); i++)
                wikis.add(new Wiki(domains.get(i)));

            // now do the searching
            for (int i = 0; i < args.length; i++)
                search(args[i]);
        }
        catch (InterruptedException ignored)
        {
            // interruption is treated as a request to stop: keep what has
            // been written so far and exit normally
        }
        catch (Exception ex)
        {
            ex.printStackTrace();
            closeOutput(); // don't lose buffered results on failure
            System.exit(2);
        }
        closeOutput();
        System.exit(0);
    }

    /**
     *  Downloads the site matrix and returns the domains of all wikis that
     *  are not private.
     *  @return the domains to search
     *  @throws IOException if the site matrix cannot be fetched
     */
    private static List<String> fetchDomains() throws IOException
    {
        StringBuilder xml = new StringBuilder();
        // HTTPS: the plain HTTP endpoint redirects, and HttpURLConnection
        // does not follow redirects across protocols
        InputStream in = new URL("https://en.wikipedia.org/w/api.php?action=sitematrix&format=xml").openStream();
        try
        {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null)
                xml.append(line);
        }
        finally
        {
            in.close();
        }
        return parseDomains(xml);
    }

    /**
     *  Parses the domains of all non-private wikis out of the site matrix.
     *  @param siteMatrix the XML returned by the sitematrix API query
     *  @return the domains found, in document order
     */
    private static List<String> parseDomains(CharSequence siteMatrix)
    {
        List<String> domains = new ArrayList<String>(1333);
        Matcher urls = SITE_URL.matcher(siteMatrix);
        while (urls.find())
        {
            String domain = urls.group(1);
            if (PRIVATE_WIKI.matcher(domain).find()) // private wiki, skip
                continue;
            domains.add(domain);
        }
        return domains;
    }

    /**
     *  Searches every wiki for links to the given site and writes the
     *  results to the output file. Returns once all workers have finished.
     *  @param site the spammed site, e.g. example.com
     *  @throws IOException if the site cannot be resolved
     *  @throws InterruptedException if interrupted while waiting for workers
     *  @throws java.lang.reflect.InvocationTargetException if closing the
     *  progress monitor fails on the event dispatch thread
     */
    private void search(String site)
        throws IOException, InterruptedException, java.lang.reflect.InvocationTargetException
    {
        final int total = wikis.size();

        // per-site state; no workers are running at this point
        hits = 0;
        pending = new CountDownLatch(total);
        monitor = new ProgressMonitor(new JFrame(), "Searching for spamlink ", site, 0, total);

        // resolve the website
        InetAddress[] addresses = InetAddress.getAllByName(site);
        for (int j = 0; j < addresses.length; j++)
            out.println(addresses[j]);
        out.println("Searching " + total + " wikis.\n");

        // search for links, throttling the rate at which threads are started
        for (int j = 0; j < total; j++)
        {
            newThread("*." + site, j);
            if (j % BATCH_SIZE == BATCH_SIZE - 1) // wait for a while
                Thread.sleep(BATCH_PAUSE_MILLIS);
        }

        // wait until every worker (not just the last one started) is done
        pending.await();
        synchronized (out)
        {
            out.println("" + hits + " links found.\n");
            out.flush();
        }

        // recycle monitor on the event dispatch thread; workers queue their
        // progress updates before counting down, so this runs after them
        final ProgressMonitor finished = monitor;
        SwingUtilities.invokeAndWait(new Runnable()
        {
            @Override
            public void run()
            {
                finished.close();
                progress = 0;
            }
        });
        monitor = null;
    }

    /** Closes the output file, if it was opened. */
    private void closeOutput()
    {
        if (out != null)
        {
            synchronized (out)
            {
                out.close();
            }
        }
    }

    /**
     *  Speed optimisation (runtime approx 4200s beforehand) because the
     *  internet is the major limitation. Starts a thread that searches one
     *  wiki, records the results and counts down the current site's latch.
     *  @param domain the link pattern to search for, e.g. *.example.com
     *  @param i the index of the wiki to search
     */
    private void newThread(final String domain, final int i)
    {
        // capture the per-site state now so a late worker can never touch
        // the monitor or latch of the next site
        final ProgressMonitor siteMonitor = monitor;
        final CountDownLatch siteLatch = pending;
        new Thread()
        {
            @Override
            public void run()
            {
                try
                {
                    Wiki wiki = wikis.get(i);
                    wiki.setMaxLag(-1); // disable maxlag for performance

                    // do spamsearch
                    ArrayList[] links = wiki.spamsearch(domain);
                    synchronized (out) // so the output file and counter don't get messed up
                    {
                        hits += links[0].size();
                        // don't print anything if there are no results
                        if (!links[0].isEmpty())
                        {
                            out.println("Results for " + wiki.getDomain() + "...");
                            for (int k = 0; k < links[0].size(); k++)
                                out.println("Page: " + links[0].get(k) + " URL: " + links[1].get(k));
                            out.println();
                        }
                    }
                }
                catch (IOException ex)
                {
                    System.err.println(ex);
                    out.flush();
                    System.exit(2);
                }
                finally
                {
                    try
                    {
                        // update the progress monitor
                        SwingUtilities.invokeLater(new Runnable()
                        {
                            @Override
                            public void run()
                            {
                                if (siteMonitor != null)
                                    siteMonitor.setProgress(++progress);
                            }
                        });
                    }
                    finally
                    {
                        // always signal completion, even after an unchecked failure
                        siteLatch.countDown();
                    }
                }
            }
        }.start();
    }
}