Monday, August 02, 2010

Turning functional code into great code: Part 3: Duplication and third-party APIs

Hello, and welcome back! This is part 3 of an ongoing series on turning functional code into great code. If you haven't read part 1 or part 2, go ahead and do so, then come on back!

Last time, we saw how to improve the StylesheetCreator class using standard Java language features and APIs. This time we'll go a little further and see what benefits certain third-party APIs can give us. But first, let's finally take a look at that StylesheetHelper class that we've heard so much about. Warning, it's long -- you may just want to skim it, then refer back to it as we're refactoring it.


package net.galluzzo.example.stylesheetbuilder.refactored.part3;

public class StylesheetHelper
{
private String ruleCondition = "";
private String childNodes = "";
private String rejectedChildNodes = "";
private String parentNodeName = "";
private String childNodeNames = "";
private String rejectNodeContents = "";

public String getStylesheetHeader( String parentNodeText,
String childNodeList )
{
parentNodeName = parentNodeText;
childNodeNames = childNodeList;
String stylesheetHeader =
"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>"
+ "<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\" xmlns:ns1=\"http://www.galluzzo.net/example/stylesheet-builder\" xmlns:java=\"net.galluzzo.example.stylesheetbuilder.Directives\" exclude-result-prefixes=\"java\">"
+ "<xsl:output method=\"xml\" indent=\"yes\" encoding=\"UTF-8\"/>"
+ "<xsl:template match=\"/\">"
+ "<" + parentNodeText + "s>"
+ "<xsl:for-each select=\"ns1:" + parentNodeText + "s/"
+ parentNodeText + "\">";

return stylesheetHeader;
}

public String getStylesheetFooter( String parentNodeText )
{
String stylesheetFooter = "</xsl:for-each></" + parentNodeText + "s>"
+ "</xsl:template></xsl:stylesheet>";

return stylesheetFooter;
}

private void setRejectedNodeContents( String nodeName, long param,
String ruleText )
{
rejectNodeContents +=
"<xsl:value-of select=\"' Rejected: '\" />"
+ "<xsl:value-of select=\"" + nodeName + "\" />"
+ "<xsl:value-of select=\"'" + ruleText + param + "'\" />";
}

private void setRejectedNodeContents( String nodeName, String param,
String ruleText )
{
rejectNodeContents += "<xsl:value-of select=\"' Rejected: '\" />"
+ "<xsl:value-of select=\"" + nodeName + "\" />"
+ "<xsl:value-of select=\"'" + ruleText + param + "'\" />";
}

private void setRejectedNodeContents( String nodeName, int max,
String ruleText, int min )
{
rejectNodeContents += "<xsl:value-of select=\"' Rejected: '\" />"
+ "<xsl:value-of select=\"" + nodeName + "\" />"
+ "<xsl:value-of select=\"'" + ruleText + min + " and " + max
+ "'\" />";
}

private void setRejectedNodeContents( String nodeName, String max,
String ruleText, String min )
{
rejectNodeContents +=
"<xsl:value-of select=\"' Rejected: '\" />"
+ "<xsl:value-of select=\"" + nodeName + "\" />"
+ "<xsl:value-of select=\"'" + ruleText + min + " and " + max
+ "'\" />";
}

private void setRuleCondition( boolean result, int ruleIndex )
{
String clause = "($result" + ruleIndex + "='" + result + "')";

if ( ruleCondition == "" )
{
ruleCondition = clause;
System.out.println( "ruleCondition = " + ruleCondition );
}
else
{
ruleCondition += " and " + clause;
System.out.println( "ruleCondition = " + ruleCondition );
}
}

private void setRejectedNode( String rejectedNodeText )
{
if ( !rejectedChildNodes.contains( "<" + rejectedNodeText + ">" ) )
{
rejectedChildNodes += "<" + rejectedNodeText
+ "><xsl:value-of select=\"" + rejectedNodeText + "\"/>" + "</"
+ rejectedNodeText + ">";
}
}

private void setNodeText( String nodeText )
{
System.out.println( "nodeText = " + nodeText );
if ( !childNodes.contains( "<" + nodeText + ">" )
&& !( nodeText.equals( "" ) ) )
{
childNodes += "<" + nodeText + "><xsl:value-of select=\"" + nodeText
+ "\"/>" + "</" + nodeText + ">";
}
}

private void setNodeText( String nodeText, int resultVarIndex )
{
childNodes += "<" + nodeText + "><xsl:value-of select=\"$result"
+ resultVarIndex + "\"/>" + "</" + nodeText + ">";
}

public String getXsltChooseStart()
{
String strChooseStart = "<xsl:choose><xsl:when test = \""
+ ruleCondition + "\">";
System.out.println( "ruleCondition = " + ruleCondition );

return strChooseStart;
}

public String getXsltWhenEnd()
{
String strXsltWhenEnd = "</xsl:when>";

return strXsltWhenEnd;
}

public String getXsltChooseEnd()
{
String strXsltChooseEnd = "</xsl:otherwise></xsl:choose>";

return strXsltChooseEnd;
}

public String getXsltOtherwiseStart()
{
String strXsltOtherwiseStart = "<xsl:otherwise>";

return strXsltOtherwiseStart;
}

public String getParentNode()
{
System.out.println( "childNodeNames = " + childNodeNames );
if ( !childNodeNames.equals( "" ) )
{
prepareChildNodes( childNodeNames );
}
String strBodyText = "<" + parentNodeName + ">" + childNodes + "</"
+ parentNodeName + ">";

return strBodyText;
}

public String getRejectedParentNode()
{
prepareRejectedChildNodes( childNodeNames );
String strBodyText = "<" + parentNodeName + "_Rejected" + ">"
+ rejectedChildNodes + "<Reject>" + rejectNodeContents
+ "</Reject></" + parentNodeName + "_Rejected" + ">";

return strBodyText;
}

private void prepareChildNodes( String childNodeNameList )
{
System.out.println( "Inside prepareChildNodes" );
String[] strList = childNodeNameList.split( "," );
for ( int i = 0; i < strList.length; i++ )
{
setNodeText( strList[i] );
}
}

private void prepareRejectedChildNodes( String childNodeNameList )
{
System.out.println( "Inside prepareRejectedChildNodes" );
String[] strList = childNodeNameList.split( "," );
for ( int i = 0; i < strList.length; i++ )
{
setRejectedNode( strList[i] );
}
}

// Rule methods

public String concatRule( int ruleIndex, String varNodeName,
String param1 )
{
System.out.println( "In concatRule" );
if ( ruleCondition == "" )
{
ruleCondition = "1";
}

String concatText = "<xsl:variable name=\"var" + ruleIndex +
"\" select=\"" + varNodeName + "\"/>"
+ "<xsl:variable name=\"result" + ruleIndex
+ "\" select=\" java:concat($var" + ruleIndex + ",'" + param1
+ "')\"/>";

System.out.println( "concatText = " + concatText );

setNodeText( varNodeName, ruleIndex );
setRejectedNode( varNodeName );
return concatText;
}

public String replaceRule( int ruleIndex, String oldValue,
String varNodeName, String newValue )
{
if ( ruleCondition == "" )
{
ruleCondition = "1";
}

String concatText = "<xsl:variable name=\"var" + ruleIndex
+ "\" select=\"" + varNodeName + "\"/>"
+ "<xsl:variable name=\"result" + ruleIndex
+ "\" select=\" java:replace('" + oldValue + "'," + "$var"
+ ruleIndex + ",'" + newValue + "')\"/>";

setNodeText( varNodeName, ruleIndex );
setRejectedNode( varNodeName );
return concatText;
}

public String greaterThanRule( int ruleIndex, String varNodeName,
long param1 )
{
String greaterThanText =
"<xsl:variable name=\"var" + ruleIndex + "\" select=\""
+ "number(" + varNodeName + ")\"/>"
+ "<xsl:variable name=\"result" + ruleIndex
+ "\" select=\" java:greaterThan($var" + ruleIndex + "," + param1
+ ")\"/>";
String errorMessage = " is not greater than ";

setRejectedNodeContents( varNodeName, param1, errorMessage );
setRuleCondition( true, ruleIndex );
setNodeText( varNodeName );
setRejectedNode( varNodeName );
return greaterThanText;
}

public String greaterThanRule( int ruleIndex, String varNodeName,
String param1 )
{
String greaterThanText =
"<xsl:variable name=\"var" + ruleIndex + "\" select=\""
+ varNodeName + "\"/>"
+ "<xsl:variable name=\"result" + ruleIndex
+ "\" select=\" java:greaterThan($var" + ruleIndex + ",'" + param1
+ "')\"/>";

setRuleCondition( true, ruleIndex );
setNodeText( varNodeName );
setRejectedNode( varNodeName );
return greaterThanText;
}

public String lessThanRule( int ruleIndex, String varNodeName,
long param1 )
{
String lessThanText =
"<xsl:variable name=\"var" + ruleIndex + "\" select=\""
+ "number(" + varNodeName + ")\"/>"
+ "<xsl:variable name=\"result" + ruleIndex
+ "\" select=\" java:lessThan($var" + ruleIndex + "," + param1
+ ")\"/>";
String errorMessage = " is not less than ";

setRejectedNodeContents( varNodeName, param1, errorMessage );
setRuleCondition( true, ruleIndex );
setNodeText( varNodeName );
setRejectedNode( varNodeName );
return lessThanText;
}

public String lessThanRule( int ruleIndex, String varNodeName,
String param1 )
{
String lessThanText =
"<xsl:variable name=\"var" + ruleIndex + "\" select=\""
+ varNodeName + "\"/>"
+ "<xsl:variable name=\"result" + ruleIndex
+ "\" select=\" java:lessThan($var" + ruleIndex + ",'" + param1
+ "')\"/>";

String errorMessage = " is not less than ";
setRejectedNodeContents( varNodeName, param1, errorMessage );
setRuleCondition( true, ruleIndex );
setNodeText( varNodeName );
setRejectedNode( varNodeName );
return lessThanText;
}

public String lengthRule( int ruleIndex, String varNodeName, int max )
{
String equalText =
"<xsl:variable name=\"var" + ruleIndex + "\" select=\""
+ varNodeName + "\"/>"
+ "<xsl:variable name=\"result" + ruleIndex
+ "\" select=\" java:length($var" + ruleIndex + "," + max + ")\"/>";

String errorMessage = " length is not equal to ";
setRejectedNodeContents( varNodeName, max, errorMessage );
setRuleCondition( true, ruleIndex );
setNodeText( varNodeName );
setRejectedNode( varNodeName );
return equalText;
}

public String betweenRule( int ruleIndex, int min, String varNodeName,
int max )
{
String betweenText =
"<xsl:variable name=\"var" + ruleIndex + "\" select=\""
+ varNodeName + "\"/>"
+ "<xsl:variable name=\"result" + ruleIndex
+ "\" select=\" java:between(" + min + "," + "$var" + ruleIndex
+ "," + max + ")\"/>";

String errorMessage = " is not between ";
setRejectedNodeContents( varNodeName, max, errorMessage, min );
setRuleCondition( true, ruleIndex );
setNodeText( varNodeName );
setRejectedNode( varNodeName );
return betweenText;
}

public String betweenRule( int ruleIndex, String min, String varNodeName,
String max )
{
String betweenText =
"<xsl:variable name=\"var" + ruleIndex + "\" select=\""
+ varNodeName + "\"/>"
+ "<xsl:variable name=\"result" + ruleIndex
+ "\" select=\" java:between('" + min + "'," + "$var" + ruleIndex
+ ",'" + max + "')\"/>";

String errorMessage = " is not between ";
setRejectedNodeContents( varNodeName, max, errorMessage, min );
setRuleCondition( true, ruleIndex );
setNodeText( varNodeName );
setRejectedNode( varNodeName );
return betweenText;
}
}


(Note that in the original implementation, there were several other "rule" methods; I have cut them down to the current set here for relative brevity.)

Let's take stock of this class. One good thing about it is that its methods are small and typically have meaningful names. However, it contains quite a lot of duplicated code, and there are a few method names that are misleading.

Method Names



First, we'll address the method names, since it's the easiest part (Alt-Shift-R in Eclipse to rename a method and all its references; I'm sure there are similar keystrokes in other IDEs). When a method name starts with "get" or "set", Java developers will assume that these are JavaBean-style accessor methods. Therefore, methods like setNodeText, which actually append to a string, should be renamed to something like addNodeText. Here's the full list of method renaming that I did:







OldNew
setNodeText(String)addChildNodeIfNotPresent
setNodeText(String, int)addProcessedChildNode
setRejectedNodeaddRejectedNode
setRejectedNodeContentsaddRejectedNodeContents
setRuleConditionaddRuleCondition


After renaming, we can run our unit test, and it still works, so we know we haven't broken anything.

Duplication



One of the cardinal rules of software design is Don't Repeat Yourself (DRY for short). If you have repeated code -- even small pieces of repeated code -- then not only is the code longer, but changes to one section have to be repeated in other sections. This makes it very easy to miss, especially if the person maintaining the code is not the person writing it, and therefore introduces bugs.

Admittedly, we're not really covering design issues yet, just third-party API usage (we'll get to design in part 4). However, we can introduce third-party APIs in much fewer places if we reduce the duplicated code first. So before we go any further, let's clean this up.

We'll start from the top down -- there are four addRejectedNodeContents methods now, and they're mostly the same. The first two are almost identical, and the second two are almost identical. So let's combine each pair as follows:


private void addRejectedNodeContents( String nodeName, long param,
String ruleText )
{
addRejectedNodeContents( nodeName, String.valueOf( param ), ruleText );
}

private void addRejectedNodeContents( String nodeName, String param,
String ruleText )
{
rejectNodeContents += "<xsl:value-of select=\"' Rejected: '\" />"
+ "<xsl:value-of select=\"" + nodeName + "\" />"
+ "<xsl:value-of select=\"'" + ruleText + param + "'\" />";
}

private void addRejectedNodeContents( String nodeName, int max,
String ruleText, int min )
{
addRejectedNodeContents( nodeName, String.valueOf( max ), ruleText,
String.valueOf( min ) );
}

private void addRejectedNodeContents( String nodeName, String max,
String ruleText, String min )
{
rejectNodeContents +=
"<xsl:value-of select=\"' Rejected: '\" />"
+ "<xsl:value-of select=\"" + nodeName + "\" />"
+ "<xsl:value-of select=\"'" + ruleText + min + " and " + max
+ "'\" />";
}


But there's still duplication between the second and fourth methods. If we factor out the error message construction (which I've renamed from "ruleText" to "errorMessage" to be clearer), then we can move to one method that actually constructs the XSLT.


private void addRejectedNodeContents( String nodeName, long param,
String errorMessage )
{
addRejectedNodeContents( nodeName, String.valueOf( param ),
errorMessage );
}

private void addRejectedNodeContents( String nodeName, String param,
String errorMessage )
{
addRejectedNodeContents( nodeName, errorMessage + param );
}

private void addRejectedNodeContents( String nodeName, int max,
String errorMessage, int min )
{
addRejectedNodeContents( nodeName, String.valueOf( max ), errorMessage,
String.valueOf( min ) );
}

private void addRejectedNodeContents( String nodeName, String max,
String errorMessage, String min )
{
addRejectedNodeContents( nodeName, errorMessage + min + " and " + max );
}

private void addRejectedNodeContents( String nodeName, String errorMessage )
{
rejectNodeContents +=
"<xsl:value-of select=\"' Rejected: '\" />"
+ "<xsl:value-of select=\"" + nodeName + "\" />"
+ "<xsl:value-of select=\"'" + errorMessage + "'\" />";
}


Hooray, no duplication! If we run our unit test again, we'll see that it still works.

The rest of the duplicated code is mostly in the rule methods. From a brief analysis, we can see that the concatRule and replaceRule methods are similar to each other (string processing rules); and that the other rule methods are similar to each other (validation rules). Let's start by combining concatRule and replaceRule as follows.


public String concatRule( int ruleIndex, String varNodeName,
String param1 )
{
System.out.println( "In concatRule" );
String resultExpression =
"java:concat($var" + ruleIndex + ",'" + param1 + "')";
return stringProcessingRule( ruleIndex, varNodeName, resultExpression );
}

public String replaceRule( int ruleIndex, String oldValue,
String varNodeName, String newValue )
{
String resultExpression =
"java:replace('" + oldValue + "'," + "$var" + ruleIndex + ",'"
+ newValue + "')";
return stringProcessingRule( ruleIndex, varNodeName, resultExpression );
}

private String stringProcessingRule( int ruleIndex, String varNodeName,
String resultExpression )
{
if ( ruleCondition.equals( "" ) )
{
ruleCondition = "1";
}

String concatText = "<xsl:variable name=\"var" + ruleIndex
+ "\" select=\"" + varNodeName + "\"/>"
+ "<xsl:variable name=\"result" + ruleIndex + "\" select=\" "
+ resultExpression + ")\"/>";

System.out.println( "concatText = " + concatText );

addProcessedChildNode( varNodeName, ruleIndex );
addRejectedNode( varNodeName );
return concatText;
}


I've also replaced the following snippet:


if ( ruleCondition == "" )


with the following:


if ( ruleCondition.equals( "" ) )


In Java, one must always compare strings using the equals method, not via the == operator. In this particular case, the code will work, because the "" string will be interned by the compiler, so only one copy will be stored in the class file. But if an instance of the class is serialized, or some maintenance programmer comes along and writes code that appends the empty string to ruleCondition in some cases, or whatever, then suddenly "" will not be the same object in memory as "", and your code will have very difficult-to-find bugs in it. Furthermore, if you were comparing a static string to a ruleCondition that had already been built up dynamically, the results would never be equal. So in general, always use equals to compare strings.

Now on to the other rules. We can combine the two greaterThanRule methods as follows:


public String greaterThanRule( int ruleIndex, String varNodeName,
long param1 )
{
return greaterThanRule( ruleIndex, varNodeName,
"number(" + varNodeName + ")", String.valueOf( param1 ),
String.valueOf( param1 ) );
}

public String greaterThanRule( int ruleIndex, String varNodeName,
String param1 )
{
return greaterThanRule( ruleIndex, varNodeName, varNodeName, param1,
"'" + param1 + "'" );
}

private String greaterThanRule( int ruleIndex, String varNodeName,
String varValue, String param, String paramValue )
{
String greaterThanText =
"<xsl:variable name=\"var" + ruleIndex + "\" select=\""
+ varValue + "\"/>"
+ "<xsl:variable name=\"result" + ruleIndex
+ "\" select=\" java:greaterThan($var" + ruleIndex + ","
+ paramValue + ")\"/>";

String errorMessage = " is not greater than ";
addRejectedNodeContents( varNodeName, param, errorMessage );
addRuleCondition( true, ruleIndex );
addChildNodeIfNotPresent( varNodeName );
addRejectedNode( varNodeName );
return greaterThanText;
}


We can make similar changes for lessThanRule, and for betweenRule. The betweenRule code is shown below:


public String betweenRule( int ruleIndex, int min, String varNodeName,
int max )
{
return betweenRule( ruleIndex, String.valueOf( min ),
String.valueOf( min ), varNodeName, "number(" + varNodeName + ")",
String.valueOf( max ), String.valueOf( max ) );
}

public String betweenRule( int ruleIndex, String min, String varNodeName,
String max )
{
return betweenRule( ruleIndex, min, "'" + min + "'", varNodeName,
varNodeName, max, "'" + max + "'" );
}

private String betweenRule( int ruleIndex, String min, String minValue,
String varNodeName, String varValue, String max, String maxValue )
{
String betweenText =
"<xsl:variable name=\"var" + ruleIndex + "\" select=\""
+ varValue + "\"/>"
+ "<xsl:variable name=\"result" + ruleIndex
+ "\" select=\" java:between(" + minValue + "," + "$var" + ruleIndex
+ "," + maxValue + ")\"/>";

String errorMessage = " is not between ";
addRejectedNodeContents( varNodeName, max, errorMessage, min );
addRuleCondition( true, ruleIndex );
addChildNodeIfNotPresent( varNodeName );
addRejectedNode( varNodeName );
return betweenText;
}


You may have noticed that the generic betweenRule method now has seven parameters. This is way too many. Ideally a method should require no more than three parameters (fewer if possible); more than that tends to indicates a design problem of some sort. And indeed there is a design problem here...but we'll tackle that in a later installment.

We're getting there, but we still have four rule methods (greaterThanRule, lessThanRule, lengthRule and betweenRule) that share a lot of code. The first three are similar, since they both take a single parameter; the betweenRule is a little different, since it takes two parameters (the min and max). So let's consolidate the first three methods:


private String greaterThanRule( int ruleIndex, String varNodeName,
String varValue, String param, String paramValue )
{
return oneParameterValidationRule( ruleIndex, varNodeName, varValue,
"greaterThan", param, paramValue, " is not greater than " );
}

private String lessThanRule( int ruleIndex, String varNodeName,
String varValue, String param, String paramValue )
{
return oneParameterValidationRule( ruleIndex, varNodeName, varValue,
"lessThan", param, paramValue, " is not less than " );
}

public String lengthRule( int ruleIndex, String varNodeName, int max )
{
return oneParameterValidationRule(ruleIndex, varNodeName, varNodeName,
"length", String.valueOf( max ), String.valueOf( max ),
" length is not equal to " );
}

private String oneParameterValidationRule( int ruleIndex,
String varNodeName, String varValue, String functionName, String param,
String paramValue, String errorMessage )
{
String ruleText =
"<xsl:variable name=\"var" + ruleIndex + "\" select=\""
+ varValue + "\"/>"
+ "<xsl:variable name=\"result" + ruleIndex
+ "\" select=\" java:" + functionName + "($var" + ruleIndex + ","
+ paramValue + ")\"/>";

addRejectedNodeContents( varNodeName, errorMessage + param );
addRuleCondition( true, ruleIndex );
addChildNodeIfNotPresent( varNodeName );
addRejectedNode( varNodeName );
return ruleText;
}


And finally, we can consolidate the oneParameterValidationRule and the betweenRule into a single validationRule:


private String oneParameterValidationRule( int ruleIndex,
String varNodeName, String varValue, String functionName, String param,
String paramValue, String errorMessage )
{
String resultExpression =
"java:" + functionName + "($var" + ruleIndex + "," + paramValue
+ ")";
return validationRule( ruleIndex, varNodeName, varValue,
resultExpression, errorMessage + param );
}

private String betweenRule( int ruleIndex, String min, String minValue,
String varNodeName, String varValue, String max, String maxValue )
{
String resultExpression =
"java:between(" + minValue + "," + "$var" + ruleIndex + ","
+ maxValue + ")";
return validationRule( ruleIndex, varNodeName, varValue,
resultExpression, " is not between " + min + " and " + max );
}

private String validationRule( int ruleIndex, String varNodeName,
String varValue, String resultExpression, String errorMessage )
{
String ruleText =
"<xsl:variable name=\"var" + ruleIndex + "\" select=\""
+ varValue + "\"/>"
+ "<xsl:variable name=\"result" + ruleIndex
+ "\" select=\" " + resultExpression + "\"/>";

addRejectedNodeContents( varNodeName, errorMessage );
addRuleCondition( true, ruleIndex );
addChildNodeIfNotPresent( varNodeName );
addRejectedNode( varNodeName );
return ruleText;
}


If we wanted to be really thorough, we would combine stringProcessingRule and validationRule; but this is probably not worth the effort at this point, since the two methods are fairly different.

Now that we've consolidated a lot of these methods, we can actually remove all the addRejectedNodeContents methods that we refactored earlier except for the fifth one that contains all the guts of the code in it. None of the rest are used anymore!

However, we have one problem: by this point, the test is failing. Why is that? Well, it turns out there were originally errors in one of the betweenRule methods (which did not put the number() function around the variable in the integer case) and one of the greaterThanRule methods (which did not put the rejection error message in the output). By adding test cases and consolidating all these duplicated methods, we have exposed and eliminated bugs in the original implementation.

Language Usage



One of the changes we wanted to make to StylesheetCreator in part 2 was to pass in a List<String> for the child node names, not a comma-delimited String containing all the names. Since this alters the public API of StylesheetCreator, let's alter our test first. We'll change this:


stylesheetCreator.createStylesheet( "employee",
"name,age,salary,category,department,bonus", rules, "test.xsl" );


to this:


stylesheetCreator.createStylesheet(
"employee",
Arrays.asList( "name", "age", "salary", "category", "department",
"bonus" ),
rules,
"test.xsl" );


The only reason we use Arrays.asList is that it's a quick shortcut for making an array from a fixed list of items. Java 7 is supposed to introduce list literals, at which point one will be able to type ["name", "age", "salary"] and so forth, like many other languages.

So now that we've altered the test case, of course, it won't compile! Let's alter StylesheetCreator to suit. We'll change it from this:


public void createStylesheet( String parentNodeName, String childNodeNames,
List<List<String>> ruleList, String outputFilename ) throws Exception
{
...
// Get header and footer stylesheet content.
String stylesheetHeader = stylesheetHelper.getStylesheetHeader(
parentNodeName, childNodeNames );
...
}


to this:


public void createStylesheet( String parentNodeName,
List<String> childNodeNames, List<List<String>> ruleList,
String outputFilename ) throws Exception
{
...
// Get header and footer stylesheet content.
String stylesheetHeader = stylesheetHelper.getStylesheetHeader(
parentNodeName, childNodeNames );
...
}


And we'll change StylesheetHelper to suit:


public class StylesheetHelper
{
...
private List<String> childNodeNames = null;
...

public String getStylesheetHeader( String parentNodeName,
List<String> childNodeNames )
{
this.parentNodeName = parentNodeName;
this.childNodeNames = childNodeNames;
...
}

public String getParentNode()
{
...
if ( !childNodeNames.isEmpty() )
{
prepareChildNodes();
}
...
}

public String getRejectedParentNode()
{
prepareRejectedChildNodes();
...
}

private void prepareChildNodes()
{
...
for ( String childNodeName : childNodeNames )
{
addChildNodeIfNotPresent( childNodeName );
}
}

private void prepareRejectedChildNodes()
{
...
for ( String childNodeName : childNodeNames )
{
addRejectedNode( childNodeName );
}
}
}


Since childNodeNames is a member variable, we can remove the parameter from the prepareChildNodes and prepareRejectedChildNodes methods. We no longer have to call String.split, which, as we pointed out previously, could result in incorrect results if the strings contain commas (which they shouldn't, being XML node names). We changed:

if ( !childNodeNames.equals( "" ) )


to:

if ( !childNodeNames.isEmpty() )


And we were able to use the more readable, concise enhanced for loop.

API Usage



Where would it make sense to use third-party libraries in our code? I can think of three libraries that may be useful.


  • Logging. The current code makes heavy use of System.out.println. While this is fine for very small applications, it is unwieldy for larger ones. Logging libraries allow you to control at runtime which log statements will appear, classify log statements by severity, control the format of each log statement including timestamps and class names, output to rolling log files, issue email alerts for certain classes of errors, and so forth.

    I'll use SLF4J on top of Log4J 1.2, although there are other good solutions like Commons Logging. The JDK comes with its own logging framework, but it is not as good as Log4J. By using a wrapper library like SLF4J or Commons Logging, you can swap out the logging implementation without changing all your code.

  • String escaping utilities. At present, none of the strings that are passed into the XML are escaped, including the string parameter values. If these are entered by a malicious user from a web page, or even accidentally entered incorrectly inside code, then invalid XML could be output -- or worse, valid XML that is controlled by the user to take advantage of a vulnerability in the program. You could think of this as XSLT injection.

    To combat this, we need to escape every parameter and value that is passed into StylesheetHelper. We could craft our own escaping utilities for doing this, but it would be easier to use something like StringEscapeUtils from Commons Lang.

  • XML creation. The main focus of the code we're looking at is to output an XSLT stylesheet, which is a type of XML. Though it's fairly easy to output XML manually, as we are currently doing, there are lots of pitfalls. For example, there's the encoding issue (currently StylesheetHelper outputs ISO-8559-1, which does not support any "special" characters), and the escaping issue mentioned above. Also, even if we output perfectly valid XML, we'd have to struggle mightily to get it all indented nicely for readability.

    We could use the DOM API built into Java for this purpose, but it's pretty unwieldy. Instead I'd recommend a library like JDOM, DOM4J, or XOM. For now, I'm not going to use an XML library, because it would change quite a bit of the code, and I'm lazy. For a real production application, though, I would use one. And perhaps when the code is designed a little better, we'll go ahead and do that.



Logging



So first, let's look at logging. By convention, a separate logger is created for each class; this enables us to differentiate output by classes in the resulting log file.


import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class MyClass
{
private static final Logger log = LoggerFactory.getLogger( MyClass.class );
}


It is private because no other classes should use it; static because it is shared amongst all class instances (and can be used by static methods); and final because it should never be re-initialized. We'll put a statement like this at the top of both StylesheetCreator and StylesheetHelper.

To actually log a message, there are several different methods, corresponding to different severities of error, called "categories" in Log4J:


  • fatal: indicates that a fatal error has occurred
  • error: indicates that some error (unexpected condition) has occurred; often used when an exception is thrown unexpectedly
  • warn: warn that something happened that might be problematic, but is recoverable
  • info: log a statement that may be of interest, but does not indicate a problem
  • debug: log a statement that is only useful when debugging the application
  • trace: log the entrance to or exit from a method, or other messages that are at a very fine level of detail


In that vein, let's change all the lines that look something like this:


System.out.println( "Inside createStylesheet" );


to lines like this:


log.trace( "Inside createStylesheet" );


Likewise, we'll change the debugging statements like this:


System.out.println("Rule table values: " + columnName + " " +
ruleId + " " + param1 + " " + param2 );


to statements like this:


log.debug("Rule table values: {} {} {} {}",
new Object[] { columnName, ruleId, param1, param2 } );


In the previous example, we used a trick that SLF4J enables, where each instance of {} is replaced by the corresponding parameter in the following list. Why would we want to do this instead of the following, more traditional code?


log.debug("Rule table values: " + columnName + " " + ruleId +
" " + param1 + " " + param2 );


Well, there are two reasons. When there are several parameters, it can sometimes be more readable to see the whole string at once than lots of concatenations. But the most compelling reason is that if you are printing out objects with computationally-intensive toString methods, then those methods will only be called if the log statement is actually output, instead of being called before ever calling the log.debug method.

I'll leave it as an exercise for the reader to go through all the other System.out.println statements and change them to Logger method calls. However, there is one I'd like to point out explicitly:


log.warn( "Tried to invoke unknown rule ID {}", ruleId );


This is a warning (and should perhaps even be an error), so we want to make sure that it's output at a higher severity, so that even if we turn off debug output, we still see this error message.

Speaking of turning off debugging, how do we actually configure this system? The answer is that by default, Log4J looks for a file called log4j.properties on the classpath somewhere at runtime. The log4j.properties file that I'm using looks like this:


# Set root logger level to DEBUG and its only appender to A1.
log4j.rootLogger=DEBUG, A1

# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender

# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%5p [%t] (%F:%L) - %m%n


Line 2 says that by default, our logging will be at the "DEBUG" level (i.e. we'll see all levels from debug and up, but not trace messages), and when a log statement is output, it'll go to appender A1. (An appender is just a class that takes a log statement and dumps it to some sort of output.) It's possible to output log statements to multiple appenders, but we won't bother for this example. Line 5 defines A1 as a ConsoleAppender, which as you might guess outputs its statements to the console. Lines 8 and 9 configure the ConsoleAppender by telling it what pattern to use to output log statements. This format will output the severity (5 characters), then the time, then the file and line number, then the message, and finally a newline, thus:


DEBUG [main] (StylesheetCreator.java:35) - Rule table values: age 2 18


For more information on how to configure Log4J, please see the documentation.

String Escaping



What will happen right now if we input a parameter to the greaterThanRule with the following value?


-a)'/><xsl:value-of select="java:deleteAllFiles();"/>
<xsl:variable value="dummy" select="string('


Well, our XML output will contain something like the following:


<xsl:variable name=\"var3\" select=\"category\"/>
<xsl:variable name=\"result3\" select=\" java:greaterThan($var3,'-a)'/>
<xsl:value-of select="java:deleteAllFiles();"/>
<xsl:variable value="dummy" select="string('')\"/>


If users are able to specify their own parameter values (say, via the web interface), then they can force our program to call the deleteAllFiles() function, which presumably does something bad. This is called injection, and great care needs to be taken in order to prevent this sort of thing from happening.

If we were to use an XML library like JDOM, strings like this would be escaped without us having to think about it. However, using JDOM at this stage would be difficult; so I'm going to opt for a rather low-tech solution: string escaping. We could write our own function, but I'm not going to reinvent the wheel. Instead, we'll use Commons Lang, which contains a handy StringEscapeUtils.escapeXml function that does exactly what we need.

Let's insert string escaping into the addProcessedChildNode method. Previously it looked like this:


private void addProcessedChildNode( String nodeName, int resultVarIndex )
{
childNodes += "<" + nodeName + "><xsl:value-of select=\"$result"
+ resultVarIndex + "\"/>" + "</" + nodeName + ">";
}


With the addition of StringEscapeUtils.escapeXml, it looks like this:


private void addProcessedChildNode( String nodeName, int resultVarIndex )
{
String escapedNodeName = StringEscapeUtils.escapeXml( nodeName );
childNodes += "<" + escapedNodeName + "><xsl:value-of select=\"$result"
+ resultVarIndex + "\"/>" + "</" + escapedNodeName + ">";
}


The following methods will need to be modified in the same way:


  • getStylesheetHeader

  • getStylesheetFooter

  • addRejectedNodeContents

  • addRejectedNode

  • addChildNodeIfNotPresent

  • addProcessedChildNode

  • stringProcessingRule

  • validationRule



We have to take care that we do not escape the same string twice.

Now, the test will not pass, because StringEscapeUtils also escapes single quote marks as &apos;. This is a harmless change: the XML parser will consider the single quotes and &apos; entities to be identical, although it isn't as pretty. So we can alter the test case accordingly.

There is still a way our users can inject malicious code into our files: the string parameters can still contain single quotes. These will now be escaped so that they do not break the XML, but they are not escaped to prevent XPath from processing them as the end of a string. I'm not going to fix the program to handle this right now, since it is surprisingly difficult, but it should be done in an actual production system.

And that's it! We saw how to reduce the duplication in StylesheetHelper, and introduced two third-party APIs: SLF4J (wrapping Log4J) and Commons Lang. Next time we'll start on the really fun stuff: improving the code's design! Stay tuned....

No comments: