Friday, July 29, 2005

Java AWK - parsing csv-files in Java can be fun

Ever parsed text files, writing loops and calling indexOf / substring just to get some boring job done? I confess, I did. Today, instead, I sat back and browsed the JDK JavaDocs, thinking about what it would take to bring some of the simplicity of AWK to Java - and it's really simple - just a few lines of code and you can write things like this:

/**
 * Example J-AWK program: registers two pattern/action rules and runs them
 * over the file "test1.txt", writing the results to standard output.
 */
public class JawkSample extends JawkProgram {

    /**
     * Registers the rules of this program. Every rule whose pattern is found
     * in an input line has its command executed for that line.
     */
    protected void registerRules() {
        addRules( new Rule[] {

            // Lines starting with "Q" followed by a digit: print field 3.
            new Rule( "^Q\\d",
                new Command(){ protected void execute() {
                    print( "\nName: " + f(3));
                }} ),

            // Lines starting with "P": report that the line is skipped.
            new Rule( "^P",
                new Command(){ protected void execute() {
                    print("\n ignoring line "+ line() );
                }} ),

        });
    }

    /**
     * Runs the sample over "test1.txt" in the current working directory.
     *
     * @param args ignored
     * @throws IOException if the input file cannot be opened or closed
     */
    public static void main(String[] args) throws IOException {
        FileInputStream in = new FileInputStream("test1.txt");
        try {
            JawkSample prog = new JawkSample();
            prog.process( in, System.out );
        } finally {
            // Release the file handle even when processing fails.
            in.close();
        }
    }
}

Ok, anonymous classes are not nearly as nice and pretty as closures a la Ruby and Python. But it does the job, cleanly and nicely. Here is the base class - it just needs rt.jar - nothing else. I never noticed how much Regexp-support improved from 1.3 to 1.4. So it is possible to easily create small domain-specific languages in Java.

package com.bmw.jawk;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Instances of JawkProgram are "J-AWK programs": AWK-like programs for
 * processing text streams in Java.
 *
 * A subclass implements {@link #registerRules()} to register {@link Rule}s,
 * each pairing a regular expression with a {@link Command}. A call to
 * {@link #process(InputStream, OutputStream)} then reads the input line by
 * line and, for every line, runs the command of each rule whose pattern is
 * found in that line, in registration order.
 *
 * @author schultma
 */
public abstract class JawkProgram {
    /** Regular expression that separates the fields of a line. */
    protected String fieldSeparator = ";";
    /** Compiled form of {@link #fieldSeparator}; rebuilt by every call to process(). */
    protected Pattern splitPattern;

    private List /*of Rule*/ rules = new ArrayList();

    private BufferedReader input;
    private BufferedWriter output;

    /** One input line together with its lazily computed fields. */
    private class Line {
        private String content;
        private String[] fields;

        public Line ( String content ) {
            this.content = content;
        }

        /** Returns the unmodified text of the line. */
        public String getContent(){
            return content;
        }

        /**
         * Returns the field at the given zero-based index. The line is split
         * on first access only.
         *
         * @param i zero-based field index
         * @return the field, or "" if the line has fewer than i+1 fields
         * @throws ArrayIndexOutOfBoundsException if i is negative
         */
        public String field( int i ) {
            if ( fields == null ) {
                fields = splitPattern.split( content );
            }
            // Pattern.split drops trailing empty fields, so an index past the
            // end is normal for lines such as "a;b;;" and yields "" as in AWK.
            if ( i >= fields.length ) {
                return "";
            }
            return fields[i];
        }

        /** Applies every registered rule to this line, in registration order. */
        public void process() {
            for ( Iterator ruleIterator = rules.iterator(); ruleIterator.hasNext(); ) {
                Rule curRule = (Rule) ruleIterator.next();
                curRule.applyTo( this );
            }
        }
    }

    /**
     * The action part of a rule. Subclasses implement {@link #execute()} and
     * use {@link #print(String)}, {@link #f(int)} and {@link #line()} to
     * work with the line currently being processed.
     */
    protected abstract class Command {
        private Line line;

        /**
         * Writes s to the program output exactly as given; no line separator
         * is appended.
         *
         * @throws RuntimeException wrapping the IOException if writing fails
         */
        protected void print( String s ) {
            emit( s );
        }

        /**
         * Returns field i (zero-based) of the current line, or "" if the line
         * has no such field.
         */
        protected String f(int i) {
            return line.field(i);
        }

        /** Returns the complete text of the current line. */
        protected String line() {
            return line.getContent();
        }

        /** Makes l the current line and runs {@link #execute()} on it. */
        public final void executeOn( Line l ) {
            line = l;
            execute();
        }

        /** The action to perform for a matching line. */
        abstract protected void execute();
    }

    /** A pattern/action pair: the command runs for every line the pattern is found in. */
    protected class Rule {
        private Matcher matchPattern;
        private Command command;

        /**
         * @param pattern regular expression searched for in each line
         * @param command action to run on a match, or null to print the
         *                matching line unchanged
         */
        public Rule( String pattern, Command command ) {
            this.matchPattern = Pattern.compile( pattern ).matcher("");
            this.command = command;
        }

        /**
         * Runs this rule's action on the line if the pattern is found
         * anywhere in it; does nothing otherwise.
         */
        public void applyTo( Line line ) {
            matchPattern.reset( line.getContent() );
            if ( matchPattern.find() ) {
                if ( command == null ) {
                    // Default action: there is no command object to call
                    // print() on, so write through the enclosing program.
                    // Like print(), this appends no line separator.
                    emit( line.getContent() );
                } else {
                    command.executeOn( line );
                }
            }
        }
    }

    /** Writes s to the current output, converting IOException to RuntimeException. */
    private void emit( String s ) {
        try {
            output.write( s );
        } catch ( IOException e ) {
            throw new RuntimeException(e);
        }
    }

    /** Appends a single rule to this program. */
    protected void addRule( Rule rule ) {
        rules.add( rule );
    }

    /** Appends all given rules to this program, preserving their order. */
    protected void addRules( Rule[] r ) {
        rules.addAll( Arrays.asList(r) );
    }

    /**
     * Registers the rules of this program via {@link #addRule(Rule)} or
     * {@link #addRules(Rule[])}.
     *
     * This method is invoked from the JawkProgram constructor, i.e. before
     * the subclass's own field initialisers and constructor body have run,
     * so implementations must not rely on subclass state at registration time.
     */
    protected abstract void registerRules();

    /** Creates the program and registers its rules by calling {@link #registerRules()}. */
    public JawkProgram () {
        registerRules();
    }

    /**
     * Reads in line by line, applies all registered rules to each line and
     * writes the resulting output to out. The output is flushed after the
     * last line; neither stream is closed. Text is decoded and encoded with
     * the platform default charset.
     *
     * @param in  stream to read the input lines from
     * @param out stream that receives everything the rules print
     * @throws RuntimeException wrapping the IOException if reading or
     *                          writing fails
     */
    public void process( InputStream in, OutputStream out ) {
        input = new BufferedReader( new InputStreamReader(in) );
        output = new BufferedWriter( new OutputStreamWriter(out) );

        // Compiled here rather than in the constructor so that a separator
        // assigned after construction is honoured.
        splitPattern = Pattern.compile( fieldSeparator );

        try {

            String currentLine;
            while ( (currentLine = input.readLine()) != null ) {
                Line l = new Line(currentLine);
                l.process();
            }
            output.flush();

        } catch ( IOException e ) {
            throw new RuntimeException(e);
        }
    }


}

2 comments:

Anonymous said...

This is cool!

Anonymous said...

This doesn't seem to be working. I tried having test1.txt with some data. No results.