# (source-viewer residue, not a test case) blob: 7867ad80ea794889f6efdc69c7555b7c84a24b27 [file] [log] [blame]
# test simple removal of session id, keeping parameters before and after
http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php
http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php?f=2
http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&q=3 http://foo.com/foo.php?f=2&q=3
http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&f=2 http://foo.com/foo.php?f=2
# test removal of different session ids including removal of ; in jsessionid
http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl http://www.foo.com/foo.php
http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl&x=y http://www.foo.com/foo.php?x=y
http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html
http://www.foo.com/foo.html?param=1&another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html?param=1&another=2
http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1&another=2 http://www.foo.com/foo.html?param=1&another=2
http://www.foo.com/foo.php?&x=1&sid=xyz&something=1 http://www.foo.com/foo.php?x=1&something=1
http://www.foo.com/foo.html?_sessionID=824A6C0A13a7e11205wxN28F44E3 http://www.foo.com/foo.html
http://www.foo.com/foo.php?_sessionid=qmyrcedt&outputformat=html&path=/3_images/foo http://www.foo.com/foo.php?outputformat=html&path=/3_images/foo
http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en&_sessionid=e36902d5bb2d0d922fc24b43 http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en
http://www.foo.com/foo.php?app=content&content=overview&lang=en&_sid=587fba8f825b05844526519fdb7d75c8&b=35&m=47 http://www.foo.com/foo.php?app=content&content=overview&lang=en&b=35&m=47
# but parameters such as NewsID or addressid are not session ids and must be kept (NUTCH-706, NUTCH-1328)
http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539
http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0 http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0
# test removal of default pages (index.* / default.*, matched case-insensitively); other names and extensions are kept
http://www.foo.com/home/index.html http://www.foo.com/home/
http://www.foo.com/index.html http://www.foo.com/
http://www.foo.com/index.htm http://www.foo.com/
http://www.foo.com/index.asp http://www.foo.com/
http://www.foo.com/index.aspx http://www.foo.com/
http://www.foo.com/index.php http://www.foo.com/
http://www.foo.com/index.php3 http://www.foo.com/
http://www.foo.com/default.html http://www.foo.com/
http://www.foo.com/default.htm http://www.foo.com/
http://www.foo.com/default.asp http://www.foo.com/
http://www.foo.com/default.aspx http://www.foo.com/
http://www.foo.com/default.php http://www.foo.com/
http://www.foo.com/default.php3 http://www.foo.com/
http://www.foo.com/something.php3 http://www.foo.com/something.php3
http://www.foo.com/something.html http://www.foo.com/something.html
http://www.foo.com/something.asp http://www.foo.com/something.asp
http://www.foo.com/index.phtml http://www.foo.com/
http://www.foo.com/index.cfm http://www.foo.com/
http://www.foo.com/index.cgi http://www.foo.com/
http://www.foo.com/index.HTML http://www.foo.com/
http://www.foo.com/index.Htm http://www.foo.com/
http://www.foo.com/index.ASP http://www.foo.com/
http://www.foo.com/index.jsp http://www.foo.com/
http://www.foo.com/index.jsf http://www.foo.com/
http://www.foo.com/index.jspx http://www.foo.com/
http://www.foo.com/index.jspfx http://www.foo.com/index.jspfx
http://www.foo.com/index.jspa http://www.foo.com/
http://www.foo.com/index.jsps http://www.foo.com/index.jsps
http://www.foo.com/index.aspX http://www.foo.com/
http://www.foo.com/index.PhP http://www.foo.com/
http://www.foo.com/index.PhP4 http://www.foo.com/
http://www.foo.com/default.HTml http://www.foo.com/
http://www.foo.com/default.HTm http://www.foo.com/
http://www.foo.com/default.ASp http://www.foo.com/
http://www.foo.com/default.AspX http://www.foo.com/
http://www.foo.com/default.PHP http://www.foo.com/
http://www.foo.com/default.PHP3 http://www.foo.com/
http://www.foo.com/index.phtml http://www.foo.com/
http://www.foo.com/index.cfm http://www.foo.com/
http://www.foo.com/index.cgi http://www.foo.com/
# ensure non-default pages are kept unchanged
http://www.foo.com/foo.php3 http://www.foo.com/foo.php3
http://www.foo.com/foo.html http://www.foo.com/foo.html
http://www.foo.com/foo.asp http://www.foo.com/foo.asp
# test removal of in-page anchors (#fragment) while keeping the query string
http://www.foo.com/foo.html#something http://www.foo.com/foo.html
http://www.foo.com/foo.html#something?x=y http://www.foo.com/foo.html?x=y
# test general cleaning of bad urls
http://www.foo.com/foo.html?&x=y http://www.foo.com/foo.html?x=y
http://www.foo.com/foo.html?x=y&&&z=a http://www.foo.com/foo.html?x=y&z=a
http://www.foo.com/foo.html? http://www.foo.com/foo.html
# remove double slashes but keep 2 slashes after protocol
http://www.foo.com//path//foo.html http://www.foo.com/path/foo.html
https://www.foo.com//path//foo.html https://www.foo.com/path/foo.html
# normalize file: protocol prefix (keep one slash)
file:///path//foo.html file:/path/foo.html
file:/path//foo.html file:/path/foo.html