Παράδειγμα: επεξεργασία αρχείων καταγραφής πρόσβασης σε ιστοσελίδες


/*
 * Collect and print Web statistics
 * D. Spinellis, 2004-2024
 */

import java.util.*;
import java.util.regex.*;
import java.io.*;

/**
 * Reads a web server access log and prints how often each host, hour of day,
 * request URL and referrer URL occurs in it.
 *
 * <p>Usage: {@code java WebStats file}
 */
class WebStats {

    // A standard log line is a line like:
    // 192.168.136.16 - - [26/Jan/2004:19:45:48 +0200] "GET /c136.html HTTP/1.1" 200 1674 "http://office/c120.html" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.5) Gecko/20031007"
    //
    // The expression is compiled with Pattern.COMMENTS, so the whitespace and
    // the "# ..." remarks below are ignored by the regex engine.
    private static final String LOG_LINE_REGEX = """
            ([-\\w.]+)\\s+      # 1. Host
            ([-\\w]+)\\s+       # 2. Logname
            ([-\\w]+)\\s+       # 3. User
            \\[(\\d+)/          # 4. Date
            (\\w+)/             # 5. Month
            (\\d+):             # 6. Year
            (\\d+):             # 7. Hour
            (\\d+)              # 8. Minute
            ([^]]+?)\\]\\s+     # 9. Rest of time
            \"([-\\w]+)\\s*     # 10. Request verb
            ([^\\s]*)           # 11. Request URL
            ([^\"]*?)\"\\s+     # 12. Request protocol etc.
            (\\d+)\\s+          # 13. Status
            ([-\\d]+)\\s+       # 14. Bytes
            \"([^\"]*)\"\\s+    # 15. Referrer URL
            \"([^\"]*)\"        # 16. Client
            """;

    // Capturing-group numbers (see LOG_LINE_REGEX) of the fields that are counted.
    private static final int HOST_GROUP = 1;
    private static final int HOUR_GROUP = 7;
    private static final int REQUEST_URL_GROUP = 11;
    private static final int REFERRER_GROUP = 15;

    /**
     * Increment by one the counter that {@code map} keeps for a member.
     * The member is the part of {@code s} that the matcher's capturing group
     * {@code group} spans. A member seen for the first time starts at 1.
     * If the group did not take part in the match there is nothing to count
     * and the map is left unchanged.
     *
     * @param map   counters keyed by member
     * @param s     the string the matcher was applied to
     * @param m     a matcher holding a successful match
     * @param group number of the capturing group that delimits the member
     */
    static void increment(Map<String, Integer> map, String s, Matcher m, int group) {
        int start = m.start(group);
        if (start < 0) {
            // Group did not participate in the match; substring(-1, -1) would throw.
            return;
        }
        String member = s.substring(start, m.end(group));
        map.merge(member, 1, Integer::sum);
    }

    /**
     * Print the title followed by one "count member" line per map entry,
     * in the map's own iteration order.
     *
     * @param title heading printed (after a blank line) before the entries
     * @param map   counters keyed by member
     */
    static void list(String title, Map<String, Integer> map) {
        System.out.println("\n" + title);
        for (Map.Entry<String, Integer> e : map.entrySet()) {
            System.out.println(e.getValue() + " " + e.getKey());
        }
    }

    /**
     * Print the title followed by one "count member" line per map entry,
     * ordered by decreasing count. Entries with the same count are ordered
     * by member name, so that every entry is listed and the output is
     * deterministic.
     *
     * @param title heading printed (after a blank line) before the entries
     * @param map   counters keyed by member
     */
    static void sortedList(String title, Map<String, Integer> map) {
        System.out.println("\n" + title);
        // A sorted set keyed on the count alone would treat entries with equal
        // counts as duplicates and silently drop them, so sort a list instead
        // and break ties on the key.
        Comparator<Map.Entry<String, Integer>> byCountThenKey = (a, b) -> {
            int byCount = b.getValue().compareTo(a.getValue());
            return byCount != 0 ? byCount : a.getKey().compareTo(b.getKey());
        };
        List<Map.Entry<String, Integer>> entries = new ArrayList<>(map.entrySet());
        entries.sort(byCountThenKey);
        for (Map.Entry<String, Integer> e : entries) {
            System.out.println(e.getValue() + " " + e.getKey());
        }
    }

    /**
     * Process the log file named by the single command-line argument and
     * print the access counts per host, hour, request URL and referrer URL.
     * Lines that do not match the log format are reported on standard output
     * and otherwise ignored. Exits with status 1 on a usage error, an
     * unreadable file or an I/O error.
     *
     * @param args command-line arguments; exactly one, the log file name
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            System.err.println("Usage: WebStats file");
            System.exit(1);
        }
        String fileName = args[0];

        Pattern cre = null;	// Compiled RE
        try {
            cre = Pattern.compile(LOG_LINE_REGEX, Pattern.COMMENTS);
        } catch (PatternSyntaxException e) {
            System.err.println("Invalid RE syntax: " + e.getDescription());
            System.exit(1);
        }

        var host = new HashMap<String, Integer>();
        var hour = new HashMap<String, Integer>();
        var request = new HashMap<String, Integer>();
        var referrer = new HashMap<String, Integer>();

        // try-with-resources closes the reader on every path, including errors.
        try (var in = new BufferedReader(new InputStreamReader(new FileInputStream(fileName)))) {
            String line;
            while ((line = in.readLine()) != null) {
                Matcher m = cre.matcher(line);
                if (!m.matches()) {
                    System.out.println("Invalid line: " + line);
                } else {
                    increment(host, line, m, HOST_GROUP);
                    increment(hour, line, m, HOUR_GROUP);
                    increment(request, line, m, REQUEST_URL_GROUP);
                    increment(referrer, line, m, REFERRER_GROUP);
                }
            }
        } catch (FileNotFoundException e) {
            System.err.println("Unable to open file " + fileName + ": " + e.getMessage());
            System.exit(1);
        } catch (IOException e) {
            System.err.println("Error reading line: " + e.getMessage());
            System.exit(1);
        }

        sortedList("Host Access Counts", host);
        sortedList("Hourly Access Counts", hour);
        sortedList("Request URL Access Counts", request);
        sortedList("Referrer URL Access Counts", referrer);
    }
}