1 #!/bin/bash
  2 #make sitemaps in different formats
  3 # v0.3.4  oct/2020  by mountaineerbr
  4 
#initial ideas were taken from
#Poor Man's Webmaster Tools,
#especially Koen Noens' scripts
  8 
  9 #Based on Google & Bing's sitemap guidelines, XML sitemaps
 10 #shouldn't contain more than 50,000 URLs and should be no
 11 #larger than 50Mb when uncompressed. So in case of a larger
 12 #site with many URLs, you can create multiple sitemap files. 
#no more than 10M (safer) to 50M uncompressed, or 50K links.
#you need to verify ownership and submit sitemap.xml to search
#providers, as they don't read sitemap.xml by default.
 16 #base urls matter: http vs https.
 17 #add Sitemap entry to robots.txt.
 18 #https://www.sitemaps.org/protocol.html
 19 #https://support.google.com/webmasters/answer/183668?hl=en
 20 #https://www.bing.com/webmaster/help/sitemaps-3b5cf6ed
 21 #localised versions (alternative languages):
 22 #https://support.google.com/webmasters/answer/189077#sitemap
 23 
 24 #local home page root
 25 LOCAL_ROOT="$HOME/www/mountaineerbr.github.io"
 26 #website root (without the ending slash!)
 27 SITE_ROOT="https://mountaineerbr.github.io"
 28 
 29 #find files with these extensions
 30 EXTENSIONS=( htm html php asp aspx jsp )
 31 #exts for `tree` (should be equivalent to $EXTENSIONS)
 32 EXTENSIONSTREE='*.htm|*.html|*.php|*.asp|*.aspx|*.jsp|sitemap.txt'
 33 
 34 #exclude patterns from the sitemaps
 35 EXARR=(
 36         #valid pattern must run in `sed -E "s,PATTERN,,"`
 37         #do escape \ as \\
 38 
 39         '.*google.*'
 40 
 41         '.*/\\..*'
 42         '.*/[a-z]/.*'
 43         '.*/bak/.*'
 44         '.*/css/.*'
 45         '.*/gfx/.*'
 46         '.*/js/.*'
 47         '.*/misc/.*'
 48         '.*/PMWMT/.*'
 49         '.*/res/.*'
 50 
 51         'index\.html$'
 52         '.*/[a-z]\.html$'
 53         '.*/fool\.html$'
 54 )
 55 #exclude for `tree` (should be equivalent to $EXARR[@])
 56 EXTREE='[a-z].html|[a-z]|index.html|fool.html|bak|css|gfx|js|res|misc|google*|PMWMT'
 57 
 58 #sitemap files
 59 #txt
 60 SMAPTXT="$LOCAL_ROOT/sitemap.txt"
 61 #xml
 62 SMAPXML="$LOCAL_ROOT/sitemap.xml"
 63 #html (directory tree)
 64 SMAPTREE="$LOCAL_ROOT/sitemap.html"
 65 
 66 #temporary found files
 67 SMAPFILES="$LOCAL_ROOT/sitemap.files.txt"
 68 
 69 #xml parts
 70 XMLHEAD='<?xml version="1.0" encoding="UTF-8"?>
 71 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
 72 XMLTAIL='</urlset>'
 73 
 74 #html parts
 75 HTMLHEAD='<meta http-equiv="content-type" content="text/html; charset=UTF-8">
 76 <title>Website map, navigate to all pages</title>
 77 <meta name="resource-type" content="document">
 78 <meta name="description" content="Site map for human visitors; this navigation page may be preferable for some people to use">
 79 <meta name="keywords" content="navigation, navegação, accessibility, acessibilidade, interface, alternativo, alternative, user navigation, navegação de usuário, discover the webste, descubra o website">
 80 <meta name="distribution" content="global">
 81 <meta name="viewport" content="width=device-width, initial-scale=1.0">
 82 <!-- <link rev="made" href="mailto:jamilbio20[[at]]gmail[[dot]]com"> -->
 83 <link rel="shortcut icon" href="favicon.ico" type="image/x-icon">'
 84 
 85 
 86 #functions
 87 
 88 #entity escaping
 89 #and change local path to site url
 90 escf()
 91 {
 92         local url
 93 
 94         #change local root to website root
 95         url="${1/"$LOCAL_ROOT"/"$SITE_ROOT"}"
 96 
 97         #escape url entities
 98         <<<"$url" sed \
 99                 -e 's/&/\&amp;/g' \
100                 -e "s/'/\&apos;/g" \
101                 -e 's/"/\&quot;/g' \
102                 -e 's/>/\&gt;/g' \
103                 -e 's/</\&lt;/g'
104 }
105 
106 
107 
108 
109 #start
110 
#exit on any error
set -e

#PART ZERO
#make file lists

#cd into webpage root directory
#(`tree` is later run on the current directory)
cd "$LOCAL_ROOT"

#reset previously generated files
#every output file is left in place as an empty placeholder, also on a
#first run when none of them exists yet: the sitemap files are listed in
#the sitemaps themselves and the XML step runs `stat` on each listed
#path, which aborts the script under `set -e` if a file is missing
for r in "$SMAPTXT" "$SMAPXML" "$SMAPTREE" "$SMAPFILES"
do
        if [[ -f "$r" ]]
        then
                rm -v -- "$r"
        fi
        : >"$r"
done
unset r
128 
#collect candidate pages, one `find` run per extension
#paths containing /. (hidden files and directories) are skipped
for suffix in "${EXTENSIONS[@]}"
do
        find "$LOCAL_ROOT" \( ! -path '*/.*' \) -name "*.$suffix"
done >>"$SMAPFILES"
unset suffix
#https://superuser.com/questions/152958/exclude-hidden-files-when-searching-with-unix-linux-find
#add slash after directories
#find "$LOCAL_ROOT" \( ! -path '*/.*' \) -type d -exec sh -c 'printf "%s/\n" "$0"' {} \;
#https://unix.stackexchange.com/questions/4847/make-find-show-slash-after-directories

#list the sitemap files themselves, too
#they were reset at the start of the script and
#sitemap.txt / sitemap.xml have no extension `find` looks for
printf '%s\n' "$SMAPTXT" "$SMAPXML" "$SMAPTREE" >>"$SMAPFILES"
150 
#exclude list
#turn every exclusion pattern into a sed rule that blanks the match,
#then apply all rules in array order and drop the lines left blank
sedrules=()
for pattern in "${EXARR[@]}"
do
        sedrules+=( -e "s,$pattern,,g" )
done
sed -i -E "${sedrules[@]}" -e '/^\s*$/d' "$SMAPFILES"
unset sedrules pattern

#sort path lists
#(case-folded version sort, duplicates removed, written back in place)
sort -f -V -u -o "$SMAPFILES" "$SMAPFILES"
165 
166 
167 
168 
#PART ONE
#TXT
#swap the local root for the site root to build urls
#the local root is matched as a literal string (the same expansion escf
#uses) rather than as a sed regex, so dots, commas or & in either root
#cannot change the result
while IFS= read -r FPATH || [[ -n "$FPATH" ]]
do
        printf '%s\n' "${FPATH/"$LOCAL_ROOT"/"$SITE_ROOT"}"
done <"$SMAPFILES" >"$SMAPTXT"
unset FPATH
173 
174 
175 
176 
#PART TWO
#XML
#write one <url> entry per path listed in $SMAPFILES
#start the counter at zero so an empty list still reports items="0"
n=0
{
        #xml top
        printf '%s\n' "$XMLHEAD"

        #make url entries
        #-r keeps backslashes in file names from being eaten by `read`
        while IFS= read -r FPATH
        do
                #counter
                n=$(( n + 1 ))

                #escape urls
                URL="$( escf "$FPATH" )"

                #file whose timestamp is reported
                #the exclusion list strips a trailing index.html and leaves
                #the directory path, so date the index page, not the directory
                SRC="$FPATH"
                if [[ -d "$FPATH" && -f "${FPATH%/}/index.html" ]]
                then
                        SRC="${FPATH%/}/index.html"
                fi

                #last modification date
                #<lastmod> is optional in the sitemap protocol: a path that
                #cannot be stat'ed is still listed, only without a date,
                #instead of aborting and leaving a truncated sitemap.xml
                MOD=
                if EPOCH="$( stat --format='%Y' -- "$SRC" )"
                then
                        MOD="$( date -Isec -d "@$EPOCH" )"
                fi

                #printf prints the url verbatim; `echo -e` would expand
                #backslash sequences contained in it
                printf '\t<url>\n'
                printf '\t\t<loc>%s</loc>\n' "$URL"
                if [[ -n "$MOD" ]]
                then
                        printf '\t\t<lastmod>%s</lastmod>\n' "$MOD"
                fi
                printf '\t</url>\n'

        done <"$SMAPFILES"

        #make a timestamp
        TS="$( date -Isec )"

        #xml bottom
        printf '%s\n' "$XMLTAIL"
        printf '<!-- generated-on="%s" -->\n' "$TS"
        printf '<!-- items="%s" -->\n' "$n"

} >"$SMAPXML"
unset FPATH SRC EPOCH n URL MOD TS
#optional attributes:
#lastmod, changefreq and priority


# finishing touches
#clean up
rm -v -- "$SMAPFILES"
220 
221 
222 
223 
#PART THREE
#HTML
#render the current directory as an html tree with `tree`
#links are based on $SITE_ROOT; only files matching $EXTENSIONSTREE are
#listed and names matching $EXTREE are ignored
treeopts=(
        -H "$SITE_ROOT"
        -P "$EXTENSIONSTREE"
        -I "$EXTREE"
        -T Sitemap
        -L 6
        -F
        -v
        --noreport
        --charset utf-8
)

#drop the default header lines, from the first <meta to the <title line
tree "${treeopts[@]}" | sed '/<meta/,/<title/ d' >"$SMAPTREE"
unset treeopts

#add custom meta tags right after the <head> line
printf '%s\n' "$HTMLHEAD" | sed -i '/<head>/ r /dev/stdin' "$SMAPTREE"
234 
235 
236 
237 
238 #PART FOUR
239 #optionally ping search engines with HTTP GET request
240 #or submit sitemap to their respective webmaster tools pages
241 
242 #google
243 #ping:http://www.google.com/ping?sitemap=https://example.com/sitemap.xml
244 #https://support.google.com/webmasters/answer/183668?hl=en#addsitemap
245 #https://search.google.com/search-console/sitemaps
246 
247 #bing & yahoo!
248 #ping:http://www.bing.com/ping?sitemap=http%3A%2F%2Fwww.example.com/sitemap.xml
249 #https://www.bing.com/webmaster/help/how-to-submit-sitemaps-82a15bd4
250 #https://www.bing.com/webmasters/sitemaps
251 
252 #duckduckgo
253 #We get our results from multiple sources so there's no place to submit
254 #them to DuckDuckGo directly. Once your site is indexed by our sources,
255 #it should show on DuckDuckGo correctly. 
256 
257 #There's no direct way to submit your website URL to Yahoo! and AOL.
258 #All search results at Yahoo! and AOL are now powered by Bing.
259 #Ask.com no longer allows you to submit sitemaps.
260 
261 #ask.com
262 #ping:http://submissions.ask.com/ping?sitemap=http://<The Domain Name>/sitemapxml.aspx
263 #ping:http://submissions.ask.com/ping?sitemap=http%3A//www.URL.com/sitemap.xml
264 #Launch your Web browser and copy and paste the entire submission URL,
265 #including your sitemap, into the browser address bar and press "Enter."
266 #A confirmation message from Ask.com appears in the browser.
267 
268