Aarrgghh!!

One guy's take on the web, programming, cigars, politics, Philadelphia, and whatever else comes to mind.

Google Sitemap Creator

Explanation

Google has introduced a standardized sitemap protocol and provides a Python-based tool that creates a sitemap automatically. However, I don't believe I have access to Python on this server, so my solution is to write my own in CFML. Here's how I do it.

I use a CFC named sitemap.cfc that does four things.

  1. It parses the directory structure, ignoring and including files according to my settings.
  2. It converts the file list from local paths to a list of URLs with last-modified dates.
  3. It adds priority and change-frequency attributes to the query, based on criteria specific to my site.
  4. It converts the query to Google-compliant XML.

Sample Usage

Calling this code:

		


1: <!---
2: 	Scans the site root and writes the sitemap to sitemap.xml under that root.
3: 	convert_directory_to_sitemap is declared returntype="void", so no returnvariable is requested.
4: 	dir_ex_list and file_ex_list must already hold the comma-delimited exclusion lists.
5: --->
6: <cfinvoke component="cfc.sitemap" method="convert_directory_to_sitemap">
7: 	<cfinvokeargument name="file_root" value="#application.file_root#" />
8: 	<cfinvokeargument name="url_root" value="#application.url#/" />
9: 	<cfinvokeargument name="extenstions_included" value="cfm,xml" />
10: 	<cfinvokeargument name="directories_excluded" value="#dir_ex_list#" />
11: 	<cfinvokeargument name="files_excluded" value="#file_ex_list#" />
12: 	<cfinvokeargument name="sitemap_file" value="#application.file_root#\sitemap.xml" />
13: </cfinvoke>


Will write the following file:

Sitemap

Code

Source

		


1: <cfcomponent hint="Encapsulates all of the functionality involved with the creation of a Google sitemap.">
2: 	<!---
3: 		Pipeline: parse_directory lists files and maps them to urls, set_google_sitemap_attributes
4: 		adds priority/changefreq columns, and create_google_sitemap writes the XML file.
5: 		convert_directory_to_sitemap chains all three. Function locals are var-scoped so that
6: 		nothing leaks into the component's variables scope.
7: 	--->
8: 
9: 	<cffunction access="private" name="quote_sql_list" output="false" returntype="string" hint="Quotes a comma-delimited list for use inside a query-of-queries IN (...) clause.">
10: 		<cfargument name="value_list" type="string" required="yes" hint="Comma-delimited values; may be empty.">
11: 		<!--- Double embedded apostrophes first so a value such as o'brien.cfm cannot end the SQL literal early. --->
12: 		<cfreturn ListQualify(Replace(arguments.value_list, "'", "''", "ALL"), "'")>
13: 	</cffunction>
14: 
15: 	<!---
16: 		parse_directory: recursively lists file_root, applies the include/exclude filters and
17: 		returns a query with one row per remaining file (columns: url, datelastmodified).
18: 		Each *_excluded / *_included argument is a comma-delimited list; an empty list disables
19: 		that filter. Directory lists are compared against cfdirectory's "directory" column, file
20: 		lists against the bare file name, extension lists against the text after the last dot.
21: 		Query-of-queries string comparisons are case-sensitive, so entries must match the case on disk.
22: 	--->
23: 	<cffunction access="public" name="parse_directory" output="false" returntype="query" hint="Converts individual files from directory structure into urls.">
24: 		<cfargument name="file_root" type="string" required="yes" hint="The file root of the directory to scan." >
25: 		<cfargument name="url_root" type="string" required="yes" hint="The url root of the site.">
26: 		<cfargument name="extenstions_excluded" type="string" required="no" default="" hint="Extenstions to Exclude.">
27: 		<cfargument name="extenstions_included" type="string" required="no" default="" hint="Extenstions to which to restrict inclusion.">
28: 		<cfargument name="directories_excluded" type="string" required="no" default="" hint="Directory to Exclude">
29: 		<cfargument name="directories_included" type="string" required="no" default="" hint="Directory to which to restrict inclusion.">
30: 		<cfargument name="files_excluded" type="string" required="no" default="" hint="Files to Exclude">
31: 		<cfargument name="files_included" type="string" required="no" default="" hint="Files to which to restrict inclusion.">
32: 
33: 		<cfset var ext_excluded = quote_sql_list(arguments.extenstions_excluded)>
34: 		<cfset var ext_included = quote_sql_list(arguments.extenstions_included)>
35: 		<cfset var dir_excluded = quote_sql_list(arguments.directories_excluded)>
36: 		<cfset var dir_included = quote_sql_list(arguments.directories_included)>
37: 		<cfset var file_excluded = quote_sql_list(arguments.files_excluded)>
38: 		<cfset var file_included = quote_sql_list(arguments.files_included)>
39: 		<cfset var files = "">
40: 		<cfset var matched_files = "">
41: 		<cfset var extension_array = ArrayNew(1)>
42: 		<cfset var url_info_query = QueryNew("url,datelastmodified")>
43: 		<!--- Work with forward slashes only, so the root can be stripped on any platform. --->
44: 		<cfset var root_path = Replace(arguments.file_root, "\", "/", "ALL")>
45: 		<!--- Drop trailing slashes from the url root; exactly one is re-added when the url is built. --->
46: 		<cfset var url_prefix = REReplace(arguments.url_root, "/+$", "")>
47: 		<cfset var relative_path = "">
48: 
49: 		<cfdirectory directory="#arguments.file_root#" action="list" name="files" filter="*" recurse="yes">
50: 
51: 		<!--- The extension is whatever follows the last dot; dot-files and names ending in a dot get none. --->
52: 		<cfloop query="files">
53: 			<cfif ListLen(files.name[files.currentRow], ".") gt 1 and Right(files.name[files.currentRow], 1) neq ".">
54: 				<cfset extension_array[files.currentRow] = ListLast(files.name[files.currentRow], ".")>
55: 			<cfelse>
56: 				<cfset extension_array[files.currentRow] = "">
57: 			</cfif>
58: 		</cfloop>
59: 
60: 		<!--- Typed as VarChar so query-of-queries never guesses a numeric type for the column. --->
61: 		<cfset QueryAddColumn(files, "file_extension", "VarChar", extension_array)>
62: 
63: 		<cfquery name="matched_files" dbtype="query">
64: 			select 	*
65: 			from  	files
66: 			where 	type != 'Dir'
67: 			<cfif Len(ext_excluded)>
68: 			and 	file_extension not in (#PreserveSingleQuotes(ext_excluded)#)
69: 			</cfif>
70: 			<cfif Len(ext_included)>
71: 			and 	file_extension in (#PreserveSingleQuotes(ext_included)#)
72: 			</cfif>
73: 			<cfif Len(dir_excluded)>
74: 			and 	directory not in (#PreserveSingleQuotes(dir_excluded)#)
75: 			</cfif>
76: 			<cfif Len(dir_included)>
77: 			and 	directory in (#PreserveSingleQuotes(dir_included)#)
78: 			</cfif>
79: 			<cfif Len(file_excluded)>
80: 			and 	name not in (#PreserveSingleQuotes(file_excluded)#)
81: 			</cfif>
82: 			<cfif Len(file_included)>
83: 			and 	name in (#PreserveSingleQuotes(file_included)#)
84: 			</cfif>
85: 		</cfquery>
86: 
87: 		<cfloop query="matched_files">
88: 			<cfset relative_path = matched_files.directory[matched_files.currentRow] & "/" & matched_files.name[matched_files.currentRow]>
89: 			<cfset relative_path = Replace(relative_path, "\", "/", "ALL")>
90: 			<cfset relative_path = ReplaceNoCase(relative_path, root_path, "", "one")>
91: 			<!--- Collapse doubled slashes in the path part only; the "//" after the url scheme is never touched. --->
92: 			<cfset relative_path = REReplace(relative_path, "/+", "/", "ALL")>
93: 			<cfset relative_path = REReplace(relative_path, "^/", "")>
94: 			<cfset QueryAddRow(url_info_query)>
95: 			<cfset QuerySetCell(url_info_query, "url", url_prefix & "/" & relative_path)>
96: 			<cfset QuerySetCell(url_info_query, "datelastmodified", matched_files.datelastmodified[matched_files.currentRow])>
97: 		</cfloop>
98: 
99: 		<cfreturn url_info_query>
100: 	</cffunction>
101: 
102: 	<!---
103: 		create_google_sitemap: serialises url_list to sitemap XML and writes it to sitemap_file as UTF-8.
104: 		url_list needs url and datelastmodified columns; changefreq and priority are written only
105: 		when the query has those columns and the row's value is non-empty.
106: 		Throws sitemap.MissingArgument when sitemap_file is blank.
107: 		NOTE(review): the namespace is the original Google 0.84 schema; confirm whether the consumer
108: 		expects the later sitemaps.org 0.9 namespace instead.
109: 	--->
110: 	<cffunction access="public" name="create_google_sitemap" output="false" returntype="void" hint="Create a file based Google sitemap.">
111: 		<cfargument name="url_list" type="query" required="no" default="" hint="The url query.">
112: 		<cfargument name="sitemap_file" type="string" required="no" default="" hint="The filename.">
113: 
114: 		<cfset var urls = arguments.url_list>
115: 		<cfset var crlf = chr(13) & chr(10)>
116: 		<!--- Look at the query's own columns rather than isDefined(), which would match any scope. --->
117: 		<cfset var has_changefreq = ListFindNoCase(arguments.url_list.columnList, "changefreq") gt 0>
118: 		<cfset var has_priority = ListFindNoCase(arguments.url_list.columnList, "priority") gt 0>
119: 		<cfset var sitemap = "">
120: 		<cfset var entry = "">
121: 		<cfset var row = 0>
122: 
123: 		<cfif not Len(Trim(arguments.sitemap_file))>
124: 			<cfthrow type="sitemap.MissingArgument" message="sitemap_file is required: it is the path the sitemap is written to.">
125: 		</cfif>
126: 
127: 		<cfset sitemap = "<?xml version='1.0' encoding='UTF-8'?>" & crlf>
128: 		<!--- Each attribute fragment ends with a space; without it the attributes run together and the XML is not well-formed. --->
129: 		<cfset sitemap = sitemap & "<urlset xmlns=""http://www.google.com/schemas/sitemap/0.84"" ">
130: 		<cfset sitemap = sitemap & "xmlns:xsi=""http://www.w3.org/2001/XMLSchema-instance"" ">
131: 		<cfset sitemap = sitemap & "xsi:schemaLocation=""http://www.google.com/schemas/sitemap/0.84 http://www.google.com/schemas/sitemap/0.84/sitemap.xsd"">" & crlf>
132: 
133: 		<cfloop query="urls">
134: 			<cfset row = urls.currentRow>
135: 			<cfset entry = "<url>" & crlf>
136: 			<cfset entry = entry & "<loc>" & XmlFormat(urls.url[row]) & "</loc>" & crlf>
137: 			<cfset entry = entry & "<lastmod>" & DateFormat(urls.datelastmodified[row], "yyyy-mm-dd") & "</lastmod>" & crlf>
138: 			<cfif has_changefreq and Len(urls.changefreq[row]) gt 0>
139: 				<cfset entry = entry & "<changefreq>" & XmlFormat(urls.changefreq[row]) & "</changefreq>" & crlf>
140: 			</cfif>
141: 			<cfif has_priority and Len(urls.priority[row]) gt 0>
142: 				<cfset entry = entry & "<priority>" & XmlFormat(urls.priority[row]) & "</priority>" & crlf>
143: 			</cfif>
144: 			<cfset entry = entry & "</url>" & crlf>
145: 			<cfset sitemap = sitemap & entry>
146: 		</cfloop>
147: 
148: 		<cfset sitemap = sitemap & "</urlset>" & crlf>
149: 
150: 		<!--- charset matches the encoding announced in the XML declaration. --->
151: 		<cffile action="write" addnewline="no" file="#arguments.sitemap_file#" output="#sitemap#" fixnewline="no" charset="utf-8">
152: 
153: 	</cffunction>
154: 
155: 	<!---
156: 		set_google_sitemap_attributes: adds priority and changefreq columns to url_list and returns it.
157: 		The query passed in is modified in place. Rules, matched case-insensitively on the url:
158: 		  priority   1.0 when the url contains home_url, 0.9 when it contains "index", otherwise 0.5
159: 		  changefreq "never" for archive pages that are not index pages, "daily" for index pages
160: 		             outside the archives, otherwise "monthly"
161: 		home_url is optional and defaults to the site this component was written for.
162: 	--->
163: 	<cffunction access="public" name="set_google_sitemap_attributes" output="false" returntype="query" hint="Munges the query to produce priorities and changefrequency.">
164: 		<cfargument name="url_list" type="query" required="yes" hint="The Query to munge.">
165: 		<cfargument name="home_url" type="string" required="no" default="http://www.numtopia.com/terry/index.cfm" hint="Url of the home page, which receives priority 1.0.">
166: 
167: 		<cfset var urls = arguments.url_list>
168: 		<cfset var priorityArray = ArrayNew(1)>
169: 		<cfset var changeArray = ArrayNew(1)>
170: 		<cfset var row = 0>
171: 		<cfset var current_url = "">
172: 		<cfset var is_index = false>
173: 		<cfset var is_archive = false>
174: 
175: 		<cfloop query="urls">
176: 			<cfset row = urls.currentRow>
177: 			<!--- Read the column through the query so the name "url" cannot be taken for the URL scope. --->
178: 			<cfset current_url = urls.url[row]>
179: 			<cfset is_index = FindNoCase("index", current_url) gt 0>
180: 			<cfset is_archive = FindNoCase("archives", current_url) gt 0>
181: 
182: 			<cfif Len(arguments.home_url) and FindNoCase(arguments.home_url, current_url)>
183: 				<cfset priorityArray[row] = 1.0>
184: 			<cfelseif is_index>
185: 				<cfset priorityArray[row] = .9>
186: 			<cfelse>
187: 				<cfset priorityArray[row] = .5>
188: 			</cfif>
189: 
190: 			<cfif is_archive and not is_index>
191: 				<cfset changeArray[row] = "never">
192: 			<cfelseif is_index and not is_archive>
193: 				<cfset changeArray[row] = "daily">
194: 			<cfelse>
195: 				<cfset changeArray[row] = "monthly">
196: 			</cfif>
197: 		</cfloop>
198: 
199: 		<cfset QueryAddColumn(urls, "priority", priorityArray)>
200: 		<cfset QueryAddColumn(urls, "changefreq", changeArray)>
201: 
202: 		<cfreturn urls>
203: 	</cffunction>
204: 
205: 	<!---
206: 		convert_directory_to_sitemap: one-call entry point. Scans file_root with the given filters,
207: 		adds priority/changefreq and writes the result to sitemap_file. Returns nothing.
208: 	--->
209: 	<cffunction access="public" name="convert_directory_to_sitemap" output="false" returntype="void" hint="Converts the directory structure of a site into a Google sitemap. ">
210: 		<cfargument name="file_root" type="string" required="yes" hint="The file root of the directory to scan." >
211: 		<cfargument name="url_root" type="string" required="yes" hint="The url root of the site.">
212: 		<cfargument name="extenstions_excluded" type="string" required="no" default="" hint="Extenstions to Exclude.">
213: 		<cfargument name="extenstions_included" type="string" required="no" default="" hint="Extenstions to which to restrict inclusion.">
214: 		<cfargument name="directories_excluded" type="string" required="no" default="" hint="Directory to Exclude">
215: 		<cfargument name="directories_included" type="string" required="no" default="" hint="Directory to which to restrict inclusion.">
216: 		<cfargument name="files_excluded" type="string" required="no" default="" hint="Files to Exclude">
217: 		<cfargument name="files_included" type="string" required="no" default="" hint="Files to which to restrict inclusion.">
218: 		<cfargument name="sitemap_file" type="string" required="no" default="" hint="The filename.">
219: 
220: 		<!--- The whole arguments scope is forwarded; parse_directory simply ignores sitemap_file. --->
221: 		<cfset var url_list = parse_directory(argumentCollection=arguments)>
222: 
223: 		<cfset url_list = set_google_sitemap_attributes(url_list)>
224: 		<cfset create_google_sitemap(url_list, arguments.sitemap_file)>
225: 
226: 	</cffunction>
227: 
228: </cfcomponent>