ElectionScraper.java


Below is the syntax highlighted version of ElectionScraper.java from §3.5 Inheritance.


/******************************************************************************
 *  Compilation:  javac ElectionScraper.java
 *  Execution:    java ElectionScraper year < codes.csv
 *  Dependencies: In.java Out.java StdIn.java StdOut.java
 *                https://introcs.cs.princeton.edu/java/data/codes.csv
 *
 *  Screen scrape election result data from http://uselectionatlas.org
 *  URLs of the form
 *
 *      http://uselectionatlas.org/RESULTS/datagraph.php?year=2004&fips=34
 *
 *  where 2004 is the election year and 34 is the FIPS code of the
 *  desired state (34 = New Jersey). The file codes.csv is used to
 *  convert from state abbreviation to FIPS code.
 *
 *  The resulting .php file has the following general format,
 *  which we rely on to screen scrape the data:
 *
 *  width:100px ...
 *  <b>County name</b> ...
 *  <td class="dat">Republican vote count</td> ...
 *  <td class="dat">Democrat vote count</td> ...
 *  <td class="dat">Third party vote count</td> ...
 *
 *  <b>County name</b> ...
 *  <td class="dat">Republican vote count</td> ...
 *  <td class="dat">Democrat vote count</td> ...
 *  <td class="dat">Third party vote count</td> ...
 *
 *  ...
 *
 *
 *  This program creates 51 files of the form NJ2004.txt. The first
 *  line of the file contains the name of the election and the candidates.
 *  Each succeeding line contains a list of counties and the number of
 *  corresponding Republican, Democrat, and Third party votes.
 *  The original description also mentions a file of the form USA2004.txt
 *  with the cumulative totals for each state in the given election, but
 *  the code in this file does not write one.
 *
 *  % java ElectionScraper 2004 < codes.csv
 *
 *  % more NJ2004.txt
 *  2004 US Presidential Election,Bush,Kerry,Other,
 *  Atlantic,49487,55746,864,
 *  Bergen,189833,207666,2745,
 *  Burlington,95936,110411,1609,
 *  Camden,81427,137765,1741,
 *  Cape May,28832,21475,455,
 *  Cumberland,24362,27875,948,
 *  Essex,83374,203681,2293,
 *  Gloucester,60033,66835,1096,
 *  Hudson,60646,127447,1353,
 *  Hunterdon,39888,26050,742,
 *  Mercer,56604,91580,1326,
 *  Middlesex,126492,166628,2685,
 *  Monmouth,163650,133773,2516,
 *  Morris,135241,98066,1847,
 *  Ocean,154204,99839,2263,
 *  Passaic,75200,94962,1149,
 *  Salem,15721,13749,311,
 *  Somerset,72508,66476,1295,
 *  Sussex,44506,23990,900,
 *  Union,82517,119372,1498,
 *  Warren,29542,18044,622,
 *
 *
 *  Remarks
 *  -------
 *    - In some elections there are 4 candidates (e.g., Clinton, Dole,
 *      Perot, Other) but this program only uses first three.
 *
 ******************************************************************************/


public class ElectionScraper {

    // base URL of the per-state results page; year and FIPS code are appended as query parameters
    private static final String RESULTS_URL = "http://uselectionatlas.org/RESULTS/datagraph.php";

    // textual markers in the downloaded page that the scraper keys on
    private static final String COUNTY_MARKER  = "width:100px";
    private static final String CANDIDATE_CELL = "<td class=\"cnd\"";
    private static final String CANDIDATE_ROW  = "<tr><td>";
    private static final String VOTE_CELL      = "<td class=\"dat\"";
    private static final String CELL_END       = "</td>";

    /**
     * Reads comma-separated state records (name, USPS abbreviation, FIPS code)
     * from standard input, downloads the results page for each state and the
     * election year given in {@code args[0]}, and writes one file per state
     * named {@code <usps><year>.txt}.
     *
     * @param args {@code args[0]} is the election year (an integer)
     */
    public static void main(String[] args) {

        if (args.length < 1) {
            System.err.println("Usage: java ElectionScraper year < codes.csv");
            return;
        }

        // year of presidential election
        int year = Integer.parseInt(args[0]);

        // scrape the data, one state per input line
        while (!StdIn.isEmpty()) {
            String line = StdIn.readLine();
            String[] fields = line.split(",");

            // blank or truncated lines cannot name a state; skip them instead of crashing
            if (fields.length < 3) {
                System.err.println("Skipping malformed input line: " + line);
                continue;
            }
            String state = fields[0];
            String usps  = fields[1];
            String fips  = fields[2];

            System.err.printf("State = %s, usps = %s, fips = %s\n", state, usps, fips);

            // screen scrape from uselectionatlas.org
            In in = new In(RESULTS_URL + "?year=" + year + "&fips=" + fips);
            String input = in.readAll();
            in.close();

            // when data is not available
            if (input.contains("<b>Warning</b>")) {
                System.err.println("Error reading: " + state + " " + usps + " " + fips);
                continue;
            }

            // save results to this file; always close it, even if scraping fails part way
            Out file = new Out(usps + year + ".txt");
            try {
                if (writeHeader(input, file, year)) {
                    writeCounties(input, file, state);
                }
                else {
                    System.err.println("Check results: candidate names not found for " + state);
                }
            }
            finally {
                file.close();
            }
        }
    }

    /**
     * Extracts the candidate names from the page and writes the header line
     * {@code <year> US Presidential Election,<candidate2>,<candidate1>,<candidate3>,}
     * where candidate1 is the text of the first {@code cnd} cell and candidate2
     * and candidate3 are the texts of the next two {@code <tr><td>} cells.
     *
     * @param input the full text of the downloaded page
     * @param file  the output file for the current state
     * @param year  the election year
     * @return {@code false} (writing nothing) if the first or second candidate
     *         name cannot be located; {@code true} otherwise
     */
    private static boolean writeHeader(String input, Out file, int year) {

        // a missing marker yields -1, which indexOf below treats like 0,
        // so the search for the first candidate cell then starts at the top
        int p = input.indexOf(COUNTY_MARKER);

        // first candidate (labelled the Democratic candidate by this program)
        p = input.indexOf(CANDIDATE_CELL, p);
        int from = input.indexOf(">", p);
        int to   = input.indexOf(CELL_END, from);
        if (p == -1 || from == -1 || to == -1) return false;
        String candidate1 = input.substring(from + 1, to);

        // second candidate (labelled the Republican candidate by this program)
        from = input.indexOf(CANDIDATE_ROW, to + 1);
        to   = input.indexOf(CELL_END, from);
        if (from == -1 || to == -1) return false;
        String candidate2 = input.substring(from + CANDIDATE_ROW.length(), to);

        // third candidate, if the markup for one exists
        from = input.indexOf(CANDIDATE_ROW, to + 1);
        to   = input.indexOf(CELL_END, from);
        boolean hasThird = (from != -1 && to != -1);
        String candidate3 = hasThird ? input.substring(from + CANDIDATE_ROW.length(), to) : "";

        // check if there is a 3rd-party candidate
        if (!hasThird || candidate3.equals(candidate2)) {
            StdOut.println("No 3rd party candidate");
            candidate3 = "";
        }

        file.println(year + " US Presidential Election,"
                      + candidate2 + "," + candidate1 + "," + candidate3 + ",");
        return true;
    }

    /**
     * Writes one line per county of the form
     * {@code <county>,<second count>,<first count>,<third count>,} where the
     * counts are the texts of the {@code dat} cells that follow the county name,
     * with commas removed. A missing third count is written as 0. Stops at the
     * first county whose markup cannot be parsed.
     *
     * @param input the full text of the downloaded page
     * @param file  the output file for the current state
     * @param state the state name, used only in diagnostic messages
     */
    private static void writeCounties(String input, Out file, String state) {
        int p = 0;
        while (true) {

            // county name occurs after width:100px label
            p = input.indexOf(COUNTY_MARKER, p);
            if (p == -1) break;

            // extract county name
            int from = input.indexOf("<b>", p);
            int to   = input.indexOf("</b>", from);
            if (from == -1 || to == -1) {
                System.err.println("Check results");
                break;
            }
            String county = input.substring(from + 3, to);

            // first vote cell: tally this program attributes to the Democratic candidate
            p    = input.indexOf(VOTE_CELL, to + 1);
            from = input.indexOf(">", p);
            to   = input.indexOf(CELL_END, from);
            if (p == -1 || from == -1 || to == -1) {
                System.err.println("Check results");
                break;
            }
            String democrat = input.substring(from + 1, to).replace(",", "");   // remove any commas

            // second vote cell: tally this program attributes to the Republican candidate
            p    = input.indexOf(VOTE_CELL, to + 1);
            from = input.indexOf(">", p);
            to   = input.indexOf(CELL_END, from);
            if (p == -1 || from == -1 || to == -1) {
                System.err.println("Check results");
                // keep the row well-formed: county, then 0 for the missing second tally
                file.println(county + ",0," + democrat + ",0,");
                break;
            }
            String republican = input.substring(from + 1, to).replace(",", "");

            // a third vote cell belongs to this county only if it comes before the next county marker
            p = to + 1;
            int p1 = input.indexOf(VOTE_CELL, p);
            int p2 = input.indexOf(COUNTY_MARKER, p);

            // no third party candidate
            // also works if p1 = p2 = -1
            if ((p1 == -1) || (p2 >= 0 && p2 <= p1)) {
                System.err.println("No third party candidate in " + county + ", " + state);
                file.println(county + "," + republican + "," + democrat + ",0,");
            }

            // third party candidate
            else {
                p    = p1;
                from = input.indexOf(">", p);
                to   = input.indexOf(CELL_END, from);
                if (from == -1 || to == -1) {
                    System.err.println("Check results");
                    break;
                }
                String independent = input.substring(from + 1, to).replace(",", "");
                file.println(county + "," + republican + "," + democrat + "," + independent + ",");
            }
        }
    }

}


Copyright © 2000–2022, Robert Sedgewick and Kevin Wayne.
Last updated: Thu Aug 11 10:27:11 EDT 2022.