| Jeff's |
Barney's #1 |
Barney's #2 |
|
|
|
Query Execution Time (ms):0 Recordcount:10 Cached:No
| | word | occurrences | | 1 | horse | 4 | | 2 | course | 3 | | 3 | smog | 2 | | 4 | keyboard | 1 | | 5 | silly | 1 | | 6 | magma | 1 | | 7 | glasses | 1 | | 8 | mouse | 1 | | 9 | vodka | 1 | | 10 | liquid | 1 |
|
<!---
Page driver: builds word histograms of a block of text with three different
implementations (defined further down in this template) and shows the results
side by side, followed by this template's own source code.
--->
<!--- Text to analyse; falls back to the sample sentence when attributes.text is not defined. --->
<!--- NOTE(review): the form below posts a field named "text", but this code reads attributes.text;
presumably a framework copies form/url values into the attributes scope - confirm. --->
<cfparam name="attributes.text" default="A horse is a horse, of course, of course, and you cannot talk to a horse, of course. smog liquid magma smog silly horse helicopter keyboard mouse glasses vodka." />
<!--- Comma-delimited words to leave out of the histograms. The list contains duplicates,
an empty element, mixed-case entries ("I", "What") and one two-word entry ("one another"). --->
<cfset ignoreList = "all,another,any,anybody,anyone,anything,both,each,either,everybody,everyone,everything,few,he,her,hers,herself,him,himself,his,I,it,its,itself,little,many,me,mine,more,most,much,myself,neither,no,one,nobody,none,nothing,one,one another,other,others,ours,ourselves,several,she,some,somebody,someone,something,that,theirs,them,themselves,these,they,this,those,us,we,what,whatever,which,whichever,who,whoever,whom,whomever,whose,you,yours,yourself,yourselves,,a,the,to,are,of,can,is,but,have,that,want,What,my,an,for,all,out,and,look,very,need,get,case" />
<!--- Run each implementation against the same text and ignore list, asking for at most 10 words. --->
<cfset hb1 = getHistogram_barney1(attributes.text, ignoreList, 10) />
<cfset hb2 = getHistogram_barney2(attributes.text, ignoreList, 10) />
<cfset hj = getHistogram_jeff(attributes.text, ignoreList, 10) />
<cfoutput>
<style type="text/css">
td { vertical-align: top }
</style>
<!--- Input form: posts back to this same page; the current text is HTML-escaped into the textarea. --->
<form method="post" action="?">
<textarea name="text" style="width:650px;height:100px;">#htmlEditFormat(attributes.text)#</textarea><br />
<input type="submit" value="build histogram" />
</form>
<!--- One column per implementation; each cell is a cfdump of that implementation's return value.
The doubled pound signs in the headers are escapes that render as a single literal pound sign. --->
<table>
<tr>
<th>Jeff's</th>
<th>Barney's ##1</th>
<th>Barney's ##2</th>
</tr>
<tr>
<td><cfdump var="#hj#" /></td>
<td><cfdump var="#hb1#" /></td>
<td><cfdump var="#hb2#" /></td>
</tr>
</table>
<hr />
<!--- Print this template's own source, HTML-escaped, below the results. --->
<pre>#htmlEditFormat(fileRead(getCurrentTemplatePath()))#</pre>
</cfoutput>
<!---
getHistogram_barney1
Counts how often each word occurs in the given text and returns the words
ordered from most to least frequent.

Arguments:
  text       - the text to analyse; every run of non-letters is treated as a word break
  ignoreList - comma-delimited words to leave out of the result (matched case-insensitively)
  maxItems   - maximum number of words to return

Returns: an array of lower-cased words, highest count first, at most maxItems long.
The relative order of words with equal counts is whatever structSort yields.
--->
<cffunction name="getHistogram_barney1" output="false" returntype="array">
	<cfargument name="text" />
	<cfargument name="ignoreList" />
	<cfargument name="maxItems" />
	<cfset var list = "" />
	<cfset var lookup = {} />
	<cfset var word = "" />
	<!--- java.util.HashSet compares case-sensitively and the text is lower-cased below,
	so the ignore list must be lower-cased too or entries such as "I" would never match --->
	<cfset var ignores = createObject("java", "java.util.HashSet").init(listToArray(lCase(arguments.ignoreList))) />
	<!--- collapse everything that is not a letter into single spaces and lower-case the result;
	kept in a local so the caller-supplied argument is not overwritten --->
	<cfset var cleaned = lCase(REReplace(arguments.text, "[^a-zA-Z]+", " ", "all")) />
	<cfloop list="#cleaned#" index="word" delimiters=" ">
		<!--- c'mon, where's CFCONTINUE? --->
		<cfif NOT ignores.contains(word)>
			<cfif structKeyExists(lookup, word)>
				<cfset lookup[word] += 1 />
			<cfelse>
				<cfset lookup[word] = 1 />
			</cfif>
		</cfif>
	</cfloop>
	<!--- structSort returns the struct's keys (the words) ordered by their numeric values (the counts) --->
	<cfset list = structSort(lookup, "numeric", "desc") />
	<cfif arrayLen(list) GT arguments.maxItems>
		<!--- keep only the first maxItems entries --->
		<cfset list = subList(list, 0, arguments.maxItems) />
	</cfif>
	<cfreturn list />
</cffunction>
<!---
subList
Copies a contiguous run of elements out of an array into a new array.

Arguments:
  a - the source array
  s - number of leading elements to skip (copying starts at position s + 1)
  e - position of the last element to copy (1-based, inclusive)

Returns: a new array holding a[s + 1] through a[e]; empty when s + 1 is greater than e.
--->
<cffunction name="subList" output="false" returntype="array">
	<cfargument name="a" />
	<cfargument name="s" />
	<cfargument name="e" />
	<cfset var slice = arrayNew(1) />
	<cfset var pos = s + 1 />
	<!--- walk the positions s + 1 .. e, appending each element in order --->
	<cfloop condition="pos LTE e">
		<cfset arrayAppend(slice, a[pos]) />
		<cfset pos = pos + 1 />
	</cfloop>
	<cfreturn slice />
</cffunction>
<!---
getHistogram_barney2
Counts how often each word occurs in the given text and returns the most
frequent words together with their counts.

Arguments:
  text       - the text to analyse; every run of non-letters is treated as a word break
  ignoreList - comma-delimited words to leave out of the result (matched case-insensitively)
  maxItems   - maximum number of rows to return

Returns: a query with columns "word" (varchar) and "occurrences" (integer),
highest count first, at most maxItems rows. The relative order of words with
equal counts is whatever structSort yields.
--->
<cffunction name="getHistogram_barney2" output="false" returntype="query">
	<cfargument name="text" />
	<cfargument name="ignoreList" />
	<cfargument name="maxItems" />
	<cfset var result = "" />
	<cfset var lookup = {} />
	<cfset var list = "" />
	<cfset var word = "" />
	<!--- java.util.HashSet compares case-sensitively and the text is lower-cased below,
	so the ignore list must be lower-cased too or entries such as "I" would never match --->
	<cfset var ignores = createObject("java", "java.util.HashSet").init(listToArray(lCase(arguments.ignoreList))) />
	<!--- collapse everything that is not a letter into single spaces and lower-case the result;
	kept in a local so the caller-supplied argument is not overwritten --->
	<cfset var cleaned = lCase(REReplace(arguments.text, "[^a-zA-Z]+", " ", "all")) />
	<cfloop list="#cleaned#" index="word" delimiters=" ">
		<!--- c'mon, where's CFCONTINUE? --->
		<cfif NOT ignores.contains(word)>
			<cfif structKeyExists(lookup, word)>
				<cfset lookup[word] += 1 />
			<cfelse>
				<cfset lookup[word] = 1 />
			</cfif>
		</cfif>
	</cfloop>
	<!--- structSort returns the struct's keys (the words) ordered by their numeric values (the counts) --->
	<cfset list = structSort(lookup, "numeric", "desc") />
	<cfif arrayLen(list) GT arguments.maxItems>
		<!--- keep only the first maxItems entries --->
		<cfset list = subList(list, 0, arguments.maxItems) />
	</cfif>
	<!--- version 1 returned 'list', here create a query with counts --->
	<cfset result = queryNew("word,occurrences", "varchar,integer") />
	<cfloop array="#list#" index="word">
		<cfset queryAddRow(result) />
		<cfset querySetCell(result, "word", word) />
		<cfset querySetCell(result, "occurrences", lookup[word]) />
	</cfloop>
	<cfreturn result />
</cffunction>
<!---
getHistogram_jeff
Counts how often each space-separated token occurs in sourceText and returns
the tokens ordered from most to least frequent.

Tokens are split on spaces only, so punctuation stays attached to the word it
touches. Counting and ignore-matching both go through struct keys.
NOTE(review): ColdFusion struct keys are normally case-insensitive, which would
merge tokens differing only in case - confirm on the target engine.

Returns: an array of tokens, highest count first, cut down to histogramLength
entries when that argument is supplied and non-blank.
--->
<cffunction name="getHistogram_jeff" returntype="array" hint="Creates a histogram of words">
	<cfargument name="sourceText" required="true" hint="The string of text we want to generate a histogram for" type="string" />
	<cfargument name="ignoreList" required="false" hint="comma delimited list of words to ignore" type="string" />
	<cfargument name="histogramLength" required="false" hint="number of words that we want to send back..ie only the top 5" type="string" />
	<cfset var histogramCount = structNew() /> <!--- our histogram! --->
	<cfset var sortedHistogram = "" /> <!--- a sorted array of our histogram --->
	<cfset var x = "" /> <!--- iterator --->
	<cfset var i = "" /> <!--- iterator --->
	<!--- these two were previously unscoped and leaked into the caller's variables scope --->
	<cfset var useNum = 0 /> <!--- first array position to drop when truncating --->
	<cfset var y = 0 /> <!--- iterator --->
	<!--- loop through all of the text, assuming that a space separates a word --->
	<cfloop delimiters=" " list="#arguments.sourceText#" index="i">
		<!--- see if we have this already in our struct --->
		<cfif structKeyExists(histogramCount, i)>
			<!--- we do! increase its count by 1 --->
			<cfset histogramCount[i] = histogramCount[i] + 1 />
		<cfelse>
			<!--- we do not, make a new key in the struct for this word --->
			<cfset histogramCount[i] = 1 />
		</cfif>
	</cfloop>
	<!--- Do we have an ignore list? --->
	<cfif structKeyExists(arguments, "ignoreList") and len(trim(arguments.ignoreList))>
		<!--- remove every ignored word from the histogram; structDelete is a no-op for missing keys --->
		<cfloop delimiters="," list="#arguments.ignoreList#" index="x">
			<cfset structDelete(histogramCount, x) />
		</cfloop>
	</cfif>
	<!--- Sort the histogram based on most occurrences of a given word --->
	<cfset sortedHistogram = StructSort(histogramCount, "numeric", "DESC") />
	<!--- see if we need to only show x number of words for this histogram --->
	<cfif structKeyExists(arguments, "histogramLength") and len(trim(arguments.histogramLength))>
		<cfset useNum = arguments.histogramLength + 1 />
		<!--- never go below position 1: a negative length would otherwise ask ArrayDeleteAt for index 0 --->
		<cfif useNum LT 1>
			<cfset useNum = 1 />
		</cfif>
		<!--- delete from the tail backwards so the remaining positions stay valid --->
		<cfloop index="y" from="#arrayLen(sortedHistogram)#" to="#useNum#" step="-1">
			<cfset ArrayDeleteAt(sortedHistogram, y) />
		</cfloop>
	</cfif>
	<cfreturn sortedHistogram />
</cffunction>