Jump to content

User:Rick Bot/scripts/gethopefuls

From Wikipedia, the free encyclopedia
#!/bin/bash

# Command (plus options) that writes the body of a URL to stdout.  It is
# expanded unquoted at every call site so the options split into words.
#   -sS  no progress meter on stderr, but still report errors
#        (matches the quiet "wget -q" alternative below)
#   -L   follow redirects; without it curl prints only the redirect
#        response that Wikipedia returns for http:// requests
WGET="/usr/bin/curl -sSL"  # on a mac OS X
# WGET="wget -q -O -"   # on a linux box with wget

# files 
# Result lists, one "name || date" line per user.
ACTIVE="hopeful.active"
SEMIACTIVE="hopeful.semi-active"
INACTIVE="hopeful.inactive"
# Prefix of the per-user cache files holding contribution timestamps.
CONTRIBS=".contrib.times"
# Output of ./getallpages (every RFA page title).
ALLRFAS="rfas"
# Prefix of the per-user files listing that user's RFA pages.
RFAS=".rfas"
# Local copy of the raw wikitext of the hopefuls list page.
WPHopefuls="wphopefuls"

# Print the English name of the month that precedes month $1.
# The match is exact and case-sensitive; for anything that is not a full
# month name nothing is printed.  The return status is 0 either way.
prevmonth() {
  # Each entry is preceded by its predecessor; December is listed at both
  # ends so that January wraps around.
  local -a calendar=(
    December January February March April May June
    July August September October November December
  )
  local idx

  for (( idx = 1; idx < ${#calendar[@]}; idx++ )); do
    if [[ "$1" == "${calendar[idx]}" ]]; then
      echo "${calendar[idx - 1]}"
      return
    fi
  done
  return 0
}

# Decide whether a user counts as inactive.
#
# Arguments:
#   $1 is day number of latest contrib
#   $2 is month of latest contrib (full English name)
#   $3 is year of latest contrib
#   $4 - $6 are day, month, year for today
# Returns:
#   0 (inactive) when the argument count is not 6, i.e. there is no latest
#     contrib to look at, or when the latest contrib is three months old
#     or older
#   1 (not inactive) when the latest contrib falls in the current month,
#     in one of the two preceding months, or in the third preceding month
#     on a day number greater than today's
# Uses prevmonth.  All working variables are local, so nothing is leaked
# into the caller's scope.
function inactive () {
  local month year back

  # if no contribs, arg count is not 6
  [ $# -ne 6 ] && return 0

  # if latest contrib is this month, not inactive
  [ "$2" = "$5" ] && [ "$3" = "$6" ] && return 1

  # walk back one month at a time: 1 = last month ... 3 = three months ago
  month=$5
  year=$6
  for back in 1 2 3; do
    month=$(prevmonth "$month")
    # stepping back from January lands in December of the previous year
    [ "$month" = "December" ] && year=$((year - 1))

    if [ "$2" = "$month" ] && [ "$3" = "$year" ]; then
      # any day in the two preceding months is recent enough
      [ "$back" -lt 3 ] && return 1
      # in the third month back only days after today's day number count
      [ "$1" -gt "$4" ] && return 1
    fi
  done

  return 0
}

# Decide whether a user counts as semi-active, judged by the 30th most
# recent contrib.
#
# Arguments:
#   $1 is day number of 30th most recent contrib
#   $2 is month of 30th most recent contrib (full English name)
#   $3 is year of 30th most recent contrib
#   $4 - $6 are day, month, year for today
# Returns:
#   1 (not semi-active) when that contrib falls in the current month, in
#     the preceding month, or in the second preceding month on a day
#     number greater than today's
#   0 (semi-active) otherwise
# Uses prevmonth.  There is no argument-count check here.  All working
# variables are local, so nothing is leaked into the caller's scope.
function semiactive () {
  local month=$5 year=$6 back

  # if 30th most recent contrib is this month, not semi-active
  [ "$2" = "$5" ] && [ "$3" = "$6" ] && return 1

  # walk back one month at a time: 1 = last month, 2 = two months ago
  for back in 1 2; do
    month=$(prevmonth "$month")
    # stepping back from January lands in December of the previous year
    [ "$month" = "December" ] && year=$((year - 1))

    if [ "$2" = "$month" ] && [ "$3" = "$year" ]; then
      # any day in the preceding month is recent enough
      [ "$back" -lt 2 ] && return 1
      # two months back only days after today's day number count
      [ "$1" -gt "$4" ] && return 1
    fi
  done

  return 0
}

# Today's date as "day month-name year"; later expanded unquoted so that
# it splits into the three trailing arguments of inactive/semiactive.
TODAY=$(date +"%e %B %Y")

# start from scratch: the per-user loop only ever appends to these lists
rm -f $ACTIVE $INACTIVE $SEMIACTIVE

# counter the per-user loop uses to pause after a batch of downloads
n=1

# List the members of the hopefuls category, keeping only the lines that
# start with "User".  A result of fewer than 500 lines is treated as a
# failed download and is retried exactly once.
for attempt in 1 2; do
  ./listcat Wikipedia_administrator_hopefuls | grep "^User" >hopefuls
  HOPEFULSSIZE=$(wc -l <hopefuls)
  (( HOPEFULSSIZE >= 500 )) && break
done

# NOTE(review): the failure is reported on stdout and with exit status 0;
# confirm nothing depends on that before changing it.
if (( HOPEFULSSIZE < 500 )); then
  echo "Can't fetch hopefuls list!"
  exit 0
fi

# Build adminlist: one admin name per line, taken from the "# {{user3|...}}"
# entries of the locally stored admin lists in ../Admins.
# alreadyadmins is emptied here; the per-user loop appends to it.
: >alreadyadmins
cat ../Admins/A* ../Admins/G* ../Admins/P* ../Admins/Semi-active ../Admins/Inactive \
  | grep "# {{user3" \
  | sed -e "s/# {{user3.//" -e "s/}}.*//" >adminlist

# Arithmetic expansion strips the padding some wc implementations print.
LINES=$(( $(wc -l <adminlist) ))

# fewer than 1000 names means the admin lists are not usable
# NOTE(review): reported on stdout with exit status 0; confirm that is wanted.
if (( LINES < 1000 )); then
  echo "Don't have good list of admins"
  exit 0
fi


# Download the list of all RFA pages into $ALLRFAS.
# A first result of fewer than 3000 lines triggers one retry; the retry is
# accepted from 2000 lines upwards, otherwise the script stops.
./getallpages "Requests_for_adminship" 4 >"$ALLRFAS"
LINES=$(( $(wc -l <"$ALLRFAS") ))

if (( LINES < 3000 )); then
  ./getallpages "Requests_for_adminship" 4 >"$ALLRFAS"
  LINES=$(( $(wc -l <"$ALLRFAS") ))

  if (( LINES < 2000 )); then
    echo "Problem fetching RFAs" >&2
    # bare exit: the status is that of the echo above
    exit
  fi
fi

# ---------------------------------------------------------------------------
# Classify every user listed in "hopefuls" as active, semi-active or
# inactive and append a "name || date" line to the matching list file.
# Users found in adminlist are written to alreadyadmins instead.
#
# If the script's first argument is "-", cached data is reused: RFA lists
# and contribution timestamps are only fetched when the cache file is
# missing (RFAs) or missing/empty (contributions).
# ---------------------------------------------------------------------------

# Print one contribution timestamp line per entry of a user's
# Special:Contributions page (30 entries).
#   $1  url-encoded user name
#   $2  extra query string appended to the URL, may be empty
_fetch_contrib_times () {
  # $WGET is deliberately unquoted: it holds a command plus its options
  $WGET "http://en.wikipedia.org/w/index.php?title=Special:Contributions&target=$1&limit=30$2" |
    grep "<li cl[^>]*><a href" |
    sed -e 's/^<li[^>]*><a href[^>]*>//' -e 's/<.*//'
}

# Print the lines of file $2 that end in "/<name>" optionally followed by
# spaces and digits.  The name $1 is compared literally, so characters such
# as "." "(" ")" "*" in a user name are not treated as regex operators.
_rfas_for_user () {
  RFA_USER="$1" awk '
    BEGIN {
      key = "/" ENVIRON["RFA_USER"]
      keylen = length(key)
    }
    {
      rest = $0
      # try every occurrence of the key, not just the first one
      while ((at = index(rest, key)) > 0) {
        if (substr(rest, at + keylen) ~ /^[ 0-9]*$/) {
          print
          next
        }
        rest = substr(rest, at + 1)
      }
    }
  ' "$2"
}

last=""
# The loop is the last stage of a pipeline, so it runs in a subshell:
# variables assigned inside it are not visible after "done".
cat hopefuls | while read -r line; do

  # reduce "User:Name/subpage" or "User talk:Name/subpage" to "Name"
  realname="${line##User:}"
  realname="${realname##User talk:}"
  realname="${realname%%/*}"

  # several pages of the same user appear on consecutive lines
  if [ "$realname" = "$last" ]; then
    continue
  fi

  last="$realname"
  urlname=$(./urlencode "$realname")

  # already an admin?  (-F -x: literal comparison against the whole line)
  if grep -Fxq -- "$realname" adminlist; then
    echo "* {{admin|" "$realname" "}}" >>alreadyadmins
    continue
  fi

  # get previous RFAs
  if [ "$1" != "-" ] || [ ! -f "$RFAS.$urlname" ]; then
    _rfas_for_user "$realname" "$ALLRFAS" >"$RFAS.$urlname"
  fi

  # figure out if user is active based on contribs we already know about;
  # a user who is active according to the cache needs no new download
  if [ -s "$CONTRIBS.$urlname" ]; then
    LATEST=$(head -n 1 "$CONTRIBS.$urlname" | cut -c8-)
    THIRTIETH=$(tail -n 1 "$CONTRIBS.$urlname" | cut -c8-)
    # the date variables are unquoted on purpose: each must split into
    # day, month and year arguments
    if ! inactive $LATEST $TODAY && ! semiactive $THIRTIETH $TODAY; then
      if [ ! -s "$CONTRIBS.$urlname.earliest" ]; then
        _fetch_contrib_times "$urlname" "&dir=prev" >"$CONTRIBS.$urlname.earliest"
      fi
      EARLIEST=$(tail -n 1 "$CONTRIBS.$urlname.earliest" | cut -c8-)
      echo "$realname || $EARLIEST" >>"$ACTIVE"
      continue
    fi
  fi

  # get latest contribs
  # pause for 10 seconds after every batch of downloads, unless running
  # from the cache
  n=$((n + 1))
  if [ "$n" -ge 10 ] && [ "$1" != "-" ]; then
    printf '%s\n' "$realname"
    sleep 10
    n=1
  fi

  if [ "$1" != "-" ] || [ ! -s "$CONTRIBS.$urlname" ]; then
    # log the command that is about to run
    echo $WGET "http://en.wikipedia.org/w/index.php?title=Special:Contributions&target=$urlname&limit=30"
    _fetch_contrib_times "$urlname" "" >"$CONTRIBS.$urlname"
  fi

  # cut -c8- drops the first seven characters of the timestamp line
  # (presumably the "hh:mm, " time prefix -- confirm against real data)
  LATEST=$(head -n 1 "$CONTRIBS.$urlname" | cut -c8-)
  THIRTIETH=$(tail -n 1 "$CONTRIBS.$urlname" | cut -c8-)

  if [ ! -s "$CONTRIBS.$urlname.earliest" ]; then
    _fetch_contrib_times "$urlname" "&dir=prev" >"$CONTRIBS.$urlname.earliest"
  fi
  EARLIEST=$(tail -n 1 "$CONTRIBS.$urlname.earliest" | cut -c8-)

  # inactive if LATEST contrib not within last three months
  # semi-active if 30th most recent contrib is more than two months ago
  if inactive $LATEST $TODAY; then
    echo "$realname || $LATEST" >>"$INACTIVE"
    continue
  fi
  if semiactive $THIRTIETH $TODAY; then
    echo "$realname || $LATEST" >>"$SEMIACTIVE"
  else
    echo "$realname || $EARLIEST" >>"$ACTIVE"
  fi
done

# fix the format and sort order
# Turn the "name || date" lines of $ACTIVE into wiki table rows
#   |-
#   | {{user20|name}} || DD-Mon-YYYY || [[rfa page|1]] [[rfa page|2]] ...
# and write them back to $ACTIVE.
sort -fdu "$ACTIVE" >"tmp.$ACTIVE"

# Lines matching "|" + non-letter are emitted first, then lines matching
# "|" + letter.  Every "name || date" line contains "||" and so passes the
# first grep.
( grep "[|][^a-zA-Z]" "tmp.$ACTIVE" ; grep "[|][a-zA-Z]" "tmp.$ACTIVE" ) | awk >"$ACTIVE" '

BEGIN {
  FS=" \\|\\| "
  # map full month names to their three-letter abbreviations
  nmonths = split("January February March April May June July August September October November December", fullname, " ")
  for (m = 1; m <= nmonths; m++) {
    monthabbr[fullname[m]] = substr(fullname[m], 1, 3)
  }
}

{
  # A user name containing "=" (e.g. User:Until(1 == 2)) would be taken by
  # MediaWiki as a named template parameter, so pass it explicitly as "1=".
  user = $1
  if (index(user, "=") > 0) {
    user = "1=" user
  }

  print "|-"

  # $2 is "day month year"; pad the day number to two digits
  split($2, date, " ")
  if (date[1] < 10) {
    date[1] = "0" date[1]
  }
  printf("%s", "| {{user20|" user "}} || " date[1] "-" monthabbr[date[2]] "-" date[3] " ||")

  # One numbered link per line of the RFA list of this user; a missing
  # file or an empty line ends the list.
  # NOTE(review): the shell loop stores these lists under the url-encoded
  # name while this lookup uses the raw name -- confirm the two agree for
  # names that ./urlencode changes.
  rfafile = ".rfas." $1
  rfanum = 1
  while ((getline rfa < rfafile) > 0 && rfa != "") {
    printf("%s", " [[" rfa "|" rfanum "]]")
    rfanum = rfanum + 1
  }
  close(rfafile)
  printf "\n"
}'

# fix the format and sort order
# Merge the inactive and semi-active "name || date" lines, turn them into
# wiki table rows
#   |-
#   | {{user20|name}} || DD-Mon-YYYY || [[rfa page|1]] [[rfa page|2]] ...
# (or "No edits" when the date is empty) and write them to $INACTIVE.
#
# Either list is only created once a user is appended to it, and sort
# exits with an error when an input file is missing, so only the files
# that exist are fed to it.
for listfile in "$INACTIVE" "$SEMIACTIVE"; do
  if [ -f "$listfile" ]; then
    cat -- "$listfile"
  fi
done | sort -fdu >"tmp.$INACTIVE"

# Lines matching "|" + non-letter are emitted first, then lines matching
# "|" + letter.  Every "name || date" line contains "||" and so passes the
# first grep.
( grep "[|][^a-zA-Z]" "tmp.$INACTIVE" ; grep "[|][a-zA-Z]" "tmp.$INACTIVE" ) | awk >"$INACTIVE" '

BEGIN {
  FS=" \\|\\| "
  # map full month names to their three-letter abbreviations
  nmonths = split("January February March April May June July August September October November December", fullname, " ")
  for (m = 1; m <= nmonths; m++) {
    monthabbr[fullname[m]] = substr(fullname[m], 1, 3)
  }
}

{
  # A user name containing "=" (e.g. User:Until(1 == 2)) would be taken by
  # MediaWiki as a named template parameter, so pass it explicitly as "1=".
  user = $1
  if (index(user, "=") > 0) {
    user = "1=" user
  }

  print "|-"

  # no date at all: the user has no contributions; no RFA links are listed
  if ($2 == "") {
    print "| {{user20|" user "}} || No edits ||"
    next
  }

  # $2 is "day month year"; pad the day number to two digits
  split($2, date, " ")
  if (date[1] < 10) {
    date[1] = "0" date[1]
  }
  printf("%s", "| {{user20|" user "}} || " date[1] "-" monthabbr[date[2]] "-" date[3] " ||")

  # One numbered link per line of the RFA list of this user; a missing
  # file or an empty line ends the list.
  # NOTE(review): the shell loop stores these lists under the url-encoded
  # name while this lookup uses the raw name -- confirm the two agree for
  # names that ./urlencode changes.
  rfafile = ".rfas." $1
  rfanum = 1
  while ((getline rfa < rfafile) > 0 && rfa != "") {
    printf("%s", " [[" rfa "|" rfanum "]]")
    rfanum = rfanum + 1
  }
  close(rfafile)
  printf "\n"
}'

# get the current contents of WP:admin_hopefuls
# (the raw wikitext is stored in $WPHopefuls)
$WGET 'http://en.wikipedia.org/w/index.php?title=Wikipedia:List_of_administrator_hopefuls&action=raw' >$WPHopefuls
# append an empty line to the downloaded text
echo >>$WPHopefuls

# remove dups from alreadyadmins
sort -fdu alreadyadmins  >tmp.alreadyadmins
mv tmp.alreadyadmins alreadyadmins

# Rebuild the page: copy $WPHopefuls to tmp.$WPHopefuls, replacing the old
# list contents with the freshly generated files.
#
# The awk program below is a small state machine:
#   file  which generated file belongs to the current section.  Each line
#         starting with "== " advances it: first hopeful.active, then
#         hopeful.inactive, then alreadyadmins for every later heading.
#   skip  1 while old list content is being dropped.
#
# Rules, in order:
#   "== Users who are already..."  heading is printed and skipping starts.
#   "{|" (table start)             the line and the two lines after it (the
#                                  "|-" row and the table header) are
#                                  printed, then skipping starts.
#   "|}" (table end)               skipping stops, the whole current file is
#                                  printed, followed by "|}".
#   lines starting with "|" or "*", and lines matching /^None./
#                                  dropped while skipping, otherwise they
#                                  reach the last rule and are printed.
#   any other line                 ends skipping.  If the current file is
#                                  alreadyadmins, its contents (or "None."
#                                  when it yields no lines) are printed
#                                  before the line itself.
#
# The files are read with "getline <file" and are not closed, so each one
# is consumed once.  Nothing in the visible part of the script reads
# tmp.$WPHopefuls afterwards.
awk <$WPHopefuls >tmp.$WPHopefuls '

/^== / {
  skip = 0
  if (file == "") {
    file = "hopeful.active"
  } else if (file == "hopeful.active") {
    file = "hopeful.inactive"
  } else {
    file = "alreadyadmins"
  }
}

/^== Users who are already/ {
  print $0
  skip = 1
  next
}

/^[{][|]/ {
  print $0

  # next line should be "|-"
  getline
  print $0
  
  # next line is the table header
  getline
  print $0

  skip = 1
  next
}

/^[|]}/ {
  skip = 0
  # and now print the table contents
  while ( ( getline <file ) > 0 ) {
    print $0
  }
  print "|}"
  next
}

/^[|*]/ {
  if (skip == 1) {
    next
  }
}

/^None./ {
  if (skip == 1) {
    next
  }
}

{
  if (skip == 1) {
    skip = 0
    line = $0
    if (file == "alreadyadmins") {
      none="None."
      while ( ( getline <file ) > 0 ) {
        none=""
        print $0
      }
      if (none == "None.") {
        print "None."
      }
    }
    $0 = line
    
  }
  print $0
}
'