mirror of
https://gitlab.com/oeffi/public-transport-enabler.git
synced 2025-07-14 08:40:29 +00:00
When scraping, detect several alternative forms of redirects.
This commit is contained in:
parent
5e47d29312
commit
12e08bbb40
2 changed files with 55 additions and 5 deletions
|
@ -160,7 +160,7 @@ public final class ParserUtils
|
|||
if (!url.getHost().equals(connection.getURL().getHost()))
|
||||
throw new UnexpectedRedirectException(url, connection.getURL());
|
||||
|
||||
final URL redirectUrl = testRedirect(peekFirstChars(is));
|
||||
final URL redirectUrl = testRedirect(url, peekFirstChars(is));
|
||||
if (redirectUrl != null)
|
||||
throw new UnexpectedRedirectException(url, redirectUrl);
|
||||
|
||||
|
@ -258,15 +258,24 @@ public final class ParserUtils
|
|||
return new String(firstBytes, 0, read).replaceAll("\\p{C}", "");
|
||||
}
|
||||
|
||||
private static final Pattern P_REDIRECT_HTTP_EQUIV = Pattern.compile("<META\\s+http-equiv=\"refresh\"\\s+content=\"\\d+;\\s*URL=([^\"]+)\"",
|
||||
private static final Pattern P_REDIRECT_HTTP_EQUIV = Pattern.compile("<META\\s+http-equiv=\"?refresh\"?\\s+content=\"\\d+;\\s*URL=([^\"]+)\"",
|
||||
Pattern.CASE_INSENSITIVE);
|
||||
|
||||
public static URL testRedirect(final String content) throws MalformedURLException
|
||||
private static final Pattern P_REDIRECT_SCRIPT = Pattern.compile(
|
||||
"<script\\s+(?:type=\"text/javascript\"|language=\"javascript\")>\\s*(?:window.location|location.href)\\s*=\\s*\"([^\"]+)\"",
|
||||
Pattern.CASE_INSENSITIVE);
|
||||
|
||||
public static URL testRedirect(final URL context, final String content) throws MalformedURLException
|
||||
{
|
||||
// check for redirect by http-equiv meta tag header
|
||||
final Matcher mHttpEquiv = P_REDIRECT_HTTP_EQUIV.matcher(content);
|
||||
if (mHttpEquiv.find())
|
||||
return new URL(mHttpEquiv.group(1));
|
||||
return new URL(context, mHttpEquiv.group(1));
|
||||
|
||||
// check for redirect by window.location javascript
|
||||
final Matcher mScript = P_REDIRECT_SCRIPT.matcher(content);
|
||||
if (mScript.find())
|
||||
return new URL(context, mScript.group(1));
|
||||
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ import static org.junit.Assert.assertNotNull;
|
|||
|
||||
import java.net.URL;
|
||||
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
|
@ -29,12 +30,52 @@ import org.junit.Test;
|
|||
*/
|
||||
public class ParserUtilsTest
|
||||
{
|
||||
private URL context;
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception
|
||||
{
|
||||
context = new URL("http://example.com");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void vodafoneRedirect() throws Exception
|
||||
{
|
||||
final URL url = ParserUtils
|
||||
.testRedirect("<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html PUBLIC \"-//WAPFORUM//DTD XHTML Mobile 1.1//EN \" \"http://www.openmobilealliance.org/tech/DTD/xhtml-mobile11.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\"; xml:lang=\"en\"><head><title>Vodafone Center</title><meta http-equiv=\"Cache-Control\" content=\"no-cache\"/><meta http-equiv=\"refresh\" content=\"1;URL=https://center.vodafone.de/vfcenter/index.html?targetUrl=http%3A%2F%2Fwww.fahrinfo-berlin.de/Fahrinfo/bin/query.bin/dn%3fstart=Suchen&REQ0JourneyStopsS0ID=A%253D1%2540L%253D9083301&REQ0JourneyStopsZ0ID=A%253D1%2540L%253D9195009&REQ0HafasSearchForw=1&REQ0JourneyDate=16.06.14&REQ0JourneyTime=16%253A32&REQ0JourneyProduct_prod_list_1=11111011&h2g-direct=11&L=vs_oeffi\"/><style type=\"text/css\">*{border:none;font-family:Arial,Helvetica,sans-serif} body{font-size:69%;line-height:140%;background-color:#F4F4F4 !important}</style></head><body><h1>Sie werden weitergeleitet ...</h1><p>Sollten Sie nicht weitergeleitet werden, klicken Sie bitte <a href=\"https://center.vodafo");
|
||||
.testRedirect(
|
||||
context,
|
||||
"<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html PUBLIC \"-//WAPFORUM//DTD XHTML Mobile 1.1//EN \" \"http://www.openmobilealliance.org/tech/DTD/xhtml-mobile11.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\"; xml:lang=\"en\"><head><title>Vodafone Center</title><meta http-equiv=\"Cache-Control\" content=\"no-cache\"/><meta http-equiv=\"refresh\" content=\"1;URL=https://center.vodafone.de/vfcenter/index.html?targetUrl=http%3A%2F%2Fwww.fahrinfo-berlin.de/Fahrinfo/bin/query.bin/dn%3fstart=Suchen&REQ0JourneyStopsS0ID=A%253D1%2540L%253D9083301&REQ0JourneyStopsZ0ID=A%253D1%2540L%253D9195009&REQ0HafasSearchForw=1&REQ0JourneyDate=16.06.14&REQ0JourneyTime=16%253A32&REQ0JourneyProduct_prod_list_1=11111011&h2g-direct=11&L=vs_oeffi\"/><style type=\"text/css\">*{border:none;font-family:Arial,Helvetica,sans-serif} body{font-size:69%;line-height:140%;background-color:#F4F4F4 !important}</style></head><body><h1>Sie werden weitergeleitet ...</h1><p>Sollten Sie nicht weitergeleitet werden, klicken Sie bitte <a href=\"https://center.vodafo");
|
||||
assertNotNull(url);
|
||||
assertEquals("center.vodafone.de", url.getHost());
|
||||
}
|
||||
|
||||
public void kabelDeutschlandRedirect() throws Exception
|
||||
{
|
||||
final URL url = ParserUtils
|
||||
.testRedirect(
|
||||
context,
|
||||
"<script type=\"text/javascript\"> window.location = \"http://www.hotspot.kabeldeutschland.de/portal/?RequestedURI=http%3A%2F%2Fwww.fahrinfo-berlin.de%2FFahrinfo%2Fbin%2Fajax-getstop.bin%2Fdny%3Fgetstop%3D1%26REQ0JourneyStopsS0A%3D255%26REQ0JourneyStopsS0G%3Dgneisenustra%25DFe%3F%26js%3Dtrue&RedirectReason=Policy&RedirectAqpId=100&DiscardAqpId=100&SubscriberId=4fa432d4a653e5f8b2acb27aa862f98d&SubscriberType=ESM&ClientIP=10.136.25.241&SystemId=10.143.181.2-1%2F2&GroupId=1&PartitionId=2&Application=Unknown&ApplicationGroup=Unknown\" </script>");
|
||||
assertNotNull(url);
|
||||
assertEquals("www.hotspot.kabeldeutschland.de", url.getHost());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void tplinkRedirect() throws Exception
|
||||
{
|
||||
final URL url = ParserUtils.testRedirect(context,
|
||||
"<body><script language=\"javaScript\">location.href=\"http://tplinkextender.net/\";</script></body></html>");
|
||||
assertNotNull(url);
|
||||
assertEquals("tplinkextender.net", url.getHost());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void mshtmlRedirect() throws Exception
|
||||
{
|
||||
final URL url = ParserUtils
|
||||
.testRedirect(
|
||||
context,
|
||||
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><HEAD><TITLE>HTML Redirection</TITLE><META http-equiv=Content-Type content=\"text/html; \"><META http-equiv=Refresh content=\"0;URL=/cgi-bin/index.cgi\"><META content=\"MSHTML 6.00.2900.2873\" name=GENERATOR></HEAD><BODY > <NOSCRIPT> If your browser can not redirect you to home page automatically.<br> Please click <a href=/cgi-bin/welcome.cgi?lang=0>here</a>. </NOSCRIPT></BODY></HTML>");
|
||||
assertNotNull(url);
|
||||
assertEquals("example.com", url.getHost());
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue