| # test simple removal of session id, keeping parameters before and after |
| http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php |
| http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php?f=2 |
| http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&q=3 http://foo.com/foo.php?f=2&q=3 |
| http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&f=2 http://foo.com/foo.php?f=2 |
| |
| # test removal of different session ids including removal of ; in jsessionid |
| http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl http://www.foo.com/foo.php |
| http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl&x=y http://www.foo.com/foo.php?x=y |
| http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html |
| http://www.foo.com/foo.html?param=1&another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html?param=1&another=2 |
| http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1&another=2 http://www.foo.com/foo.html?param=1&another=2 |
| http://www.foo.com/foo.php?&x=1&sid=xyz&something=1 http://www.foo.com/foo.php?x=1&something=1 |
| http://www.foo.com/foo.html?_sessionID=824A6C0A13a7e11205wxN28F44E3 http://www.foo.com/foo.html |
| http://www.foo.com/foo.php?_sessionid=qmyrcedt&outputformat=html&path=/3_images/foo http://www.foo.com/foo.php?outputformat=html&path=/3_images/foo |
| http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en&_sessionid=e36902d5bb2d0d922fc24b43 http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en |
| http://www.foo.com/foo.php?app=content&content=overview&lang=en&_sid=587fba8f825b05844526519fdb7d75c8&b=35&m=47 http://www.foo.com/foo.php?app=content&content=overview&lang=en&b=35&m=47 |
| # but NewsID, addressid (and similar parameters) are not session ids (NUTCH-706, NUTCH-1328) |
| http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 |
| http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0 http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0 |
| |
| # test removal of default pages |
| http://www.foo.com/home/index.html http://www.foo.com/home/ |
| http://www.foo.com/index.html http://www.foo.com/ |
| http://www.foo.com/index.htm http://www.foo.com/ |
| http://www.foo.com/index.asp http://www.foo.com/ |
| http://www.foo.com/index.aspx http://www.foo.com/ |
| http://www.foo.com/index.php http://www.foo.com/ |
| http://www.foo.com/index.php3 http://www.foo.com/ |
| http://www.foo.com/default.html http://www.foo.com/ |
| http://www.foo.com/default.htm http://www.foo.com/ |
| http://www.foo.com/default.asp http://www.foo.com/ |
| http://www.foo.com/default.aspx http://www.foo.com/ |
| http://www.foo.com/default.php http://www.foo.com/ |
| http://www.foo.com/default.php3 http://www.foo.com/ |
| http://www.foo.com/something.php3 http://www.foo.com/something.php3 |
| http://www.foo.com/something.html http://www.foo.com/something.html |
| http://www.foo.com/something.asp http://www.foo.com/something.asp |
| http://www.foo.com/index.phtml http://www.foo.com/ |
| http://www.foo.com/index.cfm http://www.foo.com/ |
| http://www.foo.com/index.cgi http://www.foo.com/ |
| http://www.foo.com/index.HTML http://www.foo.com/ |
| http://www.foo.com/index.Htm http://www.foo.com/ |
| http://www.foo.com/index.ASP http://www.foo.com/ |
| http://www.foo.com/index.jsp http://www.foo.com/ |
| http://www.foo.com/index.jsf http://www.foo.com/ |
| http://www.foo.com/index.jspx http://www.foo.com/ |
| http://www.foo.com/index.jspfx http://www.foo.com/index.jspfx |
| http://www.foo.com/index.jspa http://www.foo.com/ |
| http://www.foo.com/index.jsps http://www.foo.com/index.jsps |
| http://www.foo.com/index.aspX http://www.foo.com/ |
| http://www.foo.com/index.PhP http://www.foo.com/ |
| http://www.foo.com/index.PhP4 http://www.foo.com/ |
| http://www.foo.com/default.HTml http://www.foo.com/ |
| http://www.foo.com/default.HTm http://www.foo.com/ |
| http://www.foo.com/default.ASp http://www.foo.com/ |
| http://www.foo.com/default.AspX http://www.foo.com/ |
| http://www.foo.com/default.PHP http://www.foo.com/ |
| http://www.foo.com/default.PHP3 http://www.foo.com/ |
| http://www.foo.com/index.phtml http://www.foo.com/ |
| http://www.foo.com/index.cfm http://www.foo.com/ |
| http://www.foo.com/index.cgi http://www.foo.com/ |
| |
| # ensure keeping non-default pages |
| http://www.foo.com/foo.php3 http://www.foo.com/foo.php3 |
| http://www.foo.com/foo.html http://www.foo.com/foo.html |
| http://www.foo.com/foo.asp http://www.foo.com/foo.asp |
| |
| # test removal of in-page anchors (fragments) while keeping the query string |
| http://www.foo.com/foo.html#something http://www.foo.com/foo.html |
| http://www.foo.com/foo.html#something?x=y http://www.foo.com/foo.html?x=y |
| |
| # test general cleaning of bad urls |
| http://www.foo.com/foo.html?&x=y http://www.foo.com/foo.html?x=y |
| http://www.foo.com/foo.html?x=y&&&z=a http://www.foo.com/foo.html?x=y&z=a |
| http://www.foo.com/foo.html? http://www.foo.com/foo.html |
| |
| # remove double slashes but keep 2 slashes after protocol |
| http://www.foo.com//path//foo.html http://www.foo.com/path/foo.html |
| https://www.foo.com//path//foo.html https://www.foo.com/path/foo.html |
| |
| # normalize file: protocol prefix (keep one slash) |
| file:///path//foo.html file:/path/foo.html |
| file:/path//foo.html file:/path/foo.html |