Jeff's Barney's #1 Barney's #2
Array
1
stringhorse,
2
stringsmog
3
stringcourse,
4
stringhorse
5
stringkeyboard
6
stringsilly
7
stringcourse.
8
stringmagma
9
stringvodka.
10
stringglasses
Array
1
stringhorse
2
stringcourse
3
stringsmog
4
stringkeyboard
5
stringsilly
6
stringmagma
7
stringglasses
8
stringmouse
9
stringvodka
10
stringliquid
Query
Execution Time (ms):0
Recordcount:10
Cached:No
wordoccurrences
1horse4
2course3
3smog2
4keyboard1
5silly1
6magma1
7glasses1
8mouse1
9vodka1
10liquid1

<!---
	Demo page: builds a word histogram of attributes.text with three different
	implementations (defined further down in this template) and dumps the
	results side by side.
	NOTE(review): the form below posts a field named "text", while the template
	reads attributes.text; presumably a framework or Application-level code
	copies form fields into the attributes scope - confirm.
--->
<cfparam name="attributes.text" default="A horse is a horse, of course, of course, and you cannot talk to a horse, of course.  smog liquid magma smog silly horse helicopter keyboard mouse glasses vodka." />

<!---
	Comma-delimited list of words to leave out of the histogram.
	NOTE(review): the list contains duplicates (e.g. "one", "that", "all"), an
	empty item (",,"), mixed-case entries ("I", "What") and the two-word entry
	"one another", which can never match a single space-delimited word.
--->
<cfset ignoreList = "all,another,any,anybody,anyone,anything,both,each,either,everybody,everyone,everything,few,he,her,hers,herself,him,himself,his,I,it,its,itself,little,many,me,mine,more,most,much,myself,neither,no,one,nobody,none,nothing,one,one another,other,others,ours,ourselves,several,she,some,somebody,someone,something,that,theirs,them,themselves,these,they,this,those,us,we,what,whatever,which,whichever,who,whoever,whom,whomever,whose,you,yours,yourself,yourselves,,a,the,to,are,of,can,is,but,have,that,want,What,my,an,for,all,out,and,look,very,need,get,case" />

<!--- Run each implementation on the same text, asking for at most 10 entries. --->
<cfset hb1 = getHistogram_barney1(attributes.text, ignoreList, 10) />
<cfset hb2 = getHistogram_barney2(attributes.text, ignoreList, 10) />
<cfset hj = getHistogram_jeff(attributes.text, ignoreList, 10) />

<!--- Render the input form, the three results, and finally this template's own source. --->
<cfoutput>
<style type="text/css">
td { vertical-align: top }
</style>
<form method="post" action="?">
<textarea name="text" style="width:650px;height:100px;">#htmlEditFormat(attributes.text)#</textarea><br />
<input type="submit" value="build histogram" />
</form>

<table>
<tr>
	<th>Jeff's</th>
	<th>Barney's ##1</th>
	<th>Barney's ##2</th>
</tr>
<tr>
	<td><cfdump var="#hj#" /></td>
	<td><cfdump var="#hb1#" /></td>
	<td><cfdump var="#hb2#" /></td>
</tr>
</table>
<hr />
<!--- Reads this template from disk and prints it HTML-escaped. --->
<pre>#htmlEditFormat(fileRead(getCurrentTemplatePath()))#</pre>
</cfoutput>

<!---
	getHistogram_barney1
	Counts how often each word occurs in the given text and returns the words
	ordered by descending count.

	text       - the text to analyse; every run of non-letter characters is
	             treated as a word separator and the text is lower-cased first.
	ignoreList - comma-delimited list of words to leave out. It is lower-cased
	             before matching because the text is lower-cased too and
	             HashSet.contains() is case-sensitive; without that, entries
	             such as "I" would never match.
	maxItems   - maximum number of words to return.

	Returns an array of words (most frequent first), at most maxItems long.
--->
<cffunction name="getHistogram_barney1" output="false">
	<cfargument name="text" />
	<cfargument name="ignoreList" />
	<cfargument name="maxItems" />
	<cfset var list = "" />
	<cfset var lookup = {} />
	<cfset var word = "" />
	<!--- A hash set gives constant-time "is this word ignored?" checks. --->
	<cfset var ignores = createObject("java", "java.util.HashSet").init(listToArray(lCase(ignoreList))) />
	<!--- Collapse anything that is not a letter into a single space, then lower-case. --->
	<cfset text = lCase(REReplace(text, "[^a-zA-Z]+", " ", "all")) />
	<cfloop list="#text#" index="word" delimiters=" ">
		<cfif NOT ignores.contains(word)>
			<cfif structKeyExists(lookup, word)>
				<cfset lookup[word] += 1 />
			<cfelse>
				<cfset lookup[word] = 1 />
			</cfif>
		</cfif>
	</cfloop>
	<!--- structSort returns the struct's keys ordered by their numeric values. --->
	<cfset list = structSort(lookup, "numeric", "desc") />
	<cfif arrayLen(list) GT maxItems>
		<!--- Keep only the first maxItems entries. --->
		<cfset list = subList(list, 0, maxItems) />
	</cfif>
	<cfreturn list />
</cffunction>



<cffunction name="subList" output="false" returntype="array">
	<cfargument name="a" />
	<cfargument name="s" />
	<cfargument name="e" />
	<!---
		Returns a new array holding elements s+1 through e (1-based, inclusive)
		of array a; in other words s is a zero-based start offset and e is an
		exclusive zero-based end offset. No bounds checking is done.
	--->
	<cfscript>
		var slice = arrayNew(1);
		var pos = 0;
		for (pos = s + 1; pos LTE e; pos = pos + 1) {
			arrayAppend(slice, a[pos]);
		}
		return slice;
	</cfscript>
</cffunction>



<!---
	getHistogram_barney2
	Same counting as getHistogram_barney1, but returns a query that also
	carries the number of occurrences of each word.

	text       - the text to analyse; every run of non-letter characters is
	             treated as a word separator and the text is lower-cased first.
	ignoreList - comma-delimited list of words to leave out. It is lower-cased
	             before matching because the text is lower-cased too and
	             HashSet.contains() is case-sensitive; without that, entries
	             such as "I" would never match.
	maxItems   - maximum number of rows to return.

	Returns a query with columns "word" (varchar) and "occurrences" (integer),
	most frequent word first, at most maxItems rows.
--->
<cffunction name="getHistogram_barney2" output="false">
	<cfargument name="text" />
	<cfargument name="ignoreList" />
	<cfargument name="maxItems" />
	<cfset var result = "" />
	<cfset var lookup = {} />
	<cfset var list = "" />
	<cfset var word = "" />
	<!--- A hash set gives constant-time "is this word ignored?" checks. --->
	<cfset var ignores = createObject("java", "java.util.HashSet").init(listToArray(lCase(ignoreList))) />
	<!--- Collapse anything that is not a letter into a single space, then lower-case. --->
	<cfset text = lCase(REReplace(text, "[^a-zA-Z]+", " ", "all")) />
	<cfloop list="#text#" index="word" delimiters=" ">
		<cfif NOT ignores.contains(word)>
			<cfif structKeyExists(lookup, word)>
				<cfset lookup[word] += 1 />
			<cfelse>
				<cfset lookup[word] = 1 />
			</cfif>
		</cfif>
	</cfloop>
	<!--- structSort returns the struct's keys ordered by their numeric values. --->
	<cfset list = structSort(lookup, "numeric", "desc") />
	<cfif arrayLen(list) GT maxItems>
		<!--- Keep only the first maxItems entries. --->
		<cfset list = subList(list, 0, maxItems) />
	</cfif>
	<!--- Turn the ordered word list into a query, looking each count up again. --->
	<cfset result = queryNew("word,occurrences", "varchar,integer") />
	<cfloop array="#list#" index="word">
		<cfset queryAddRow(result) />
		<cfset querySetCell(result, "word", word) />
		<cfset querySetCell(result, "occurrences", lookup[word]) />
	</cfloop>
	<cfreturn result />
</cffunction>


<cffunction name="getHistogram_jeff" returntype="array" hint="Creates a histogram of words">
    <cfargument name="sourceText" required="true" hint="The string of text we want to generate a histogram for" type="string" />
    <cfargument name="ignoreList" required="false" hint="comma delineated list of words to ignore" type="string" />
    <cfargument name="histogramLength" required="false" hint="number of words that we want to send back..ie only the top 5" type="string" />

    <!---
        Counts space-delimited tokens of sourceText, removes the tokens named in
        ignoreList, and returns the remaining tokens ordered by descending count.
        Tokens are split on spaces only, so punctuation stays attached
        ("horse," and "horse" are counted separately).
        When histogramLength is supplied, only that many leading entries are kept.

        Every local below is var-scoped so that nothing leaks into the
        template's shared variables scope.
    --->
    <cfset var histogramCount = structNew() /> <!--- token -> number of occurrences --->
    <cfset var sortedHistogram = "" />  <!--- tokens ordered by descending count --->
    <cfset var x = "" /> <!--- current ignore-list entry --->
    <cfset var i = "" /> <!--- current token --->
    <cfset var y = 0 /> <!--- array position being trimmed --->
    <cfset var useNum = 0 /> <!--- first array position to drop --->

    <!--- Tally every token, assuming a space separates words. --->
    <cfloop delimiters=" " list="#sourceText#" index="i">
        <cfif structKeyExists(histogramCount, i)>
            <cfset histogramCount[i] = histogramCount[i] + 1 />
        <cfelse>
            <cfset histogramCount[i] = 1 />
        </cfif>
    </cfloop>

    <!--- Drop ignored words; structDelete is a no-op for keys that are absent. --->
    <cfif structKeyExists(arguments, "ignoreList") and len(trim(arguments.ignoreList))>
        <cfloop delimiters="," list="#arguments.ignoreList#" index="x">
            <cfset structDelete(histogramCount, x) />
        </cfloop>
    </cfif>

    <!--- Sort the histogram based on most occurrences of a given word. --->
    <cfset sortedHistogram = StructSort(histogramCount, "numeric", "DESC") />

    <!--- Trim from the tail so earlier positions stay valid while deleting. --->
    <cfif structKeyExists(arguments, "histogramLength") and len(trim(arguments.histogramLength))>
        <!--- A negative length is treated as 0; otherwise the loop would reach index 0 and throw. --->
        <cfset useNum = max(arguments.histogramLength, 0) + 1 />
        <cfloop index="y" from="#arrayLen(sortedHistogram)#" to="#useNum#" step="-1">
            <cfset ArrayDeleteAt(sortedHistogram, y) />
        </cfloop>
    </cfif>

    <cfreturn sortedHistogram>

</cffunction>