Make ParserUtils.scrape() use scrapeInputStream(), because it duplicated a lot of code. Add a test case for detecting a redirect by http-equiv meta tag.

This commit is contained in:
Andreas Schildbach 2014-06-16 11:47:18 +02:00
parent 6438b71fad
commit 27894dbf78
2 changed files with 68 additions and 131 deletions

View file

@ -26,7 +26,7 @@ import java.io.OutputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.SocketTimeoutException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
@ -55,9 +55,6 @@ public final class ParserUtils
private static final int SCRAPE_CONNECT_TIMEOUT = 5000;
private static final int SCRAPE_READ_TIMEOUT = 15000;
private static final Charset SCRAPE_DEFAULT_ENCODING = Charset.forName("ISO-8859-1");
private static final int SCRAPE_PAGE_EMPTY_THRESHOLD = 2;
private static final Pattern P_REFRESH = Pattern.compile("<META\\s+http-equiv=\"refresh\"\\s+content=\"\\d+;\\s*URL=([^\"]+)\"",
Pattern.CASE_INSENSITIVE);
private static String stateCookie;
@ -77,135 +74,18 @@ public final class ParserUtils
return scrape(url, postRequest, encoding, sessionCookieName, 3);
}
// NOTE(review): this listing carries no +/- markers, so two versions of scrape() appear
// interleaved here. Going by the hunk header (-77,135 +74,18) and the duplicated
// declarations, the first signature, the `encoding` null check and the whole
// `while (true)` loop look like the superseded code, while the second signature, the
// `requestEncoding` null check and the six statements after the loop look like the
// replacement. TODO confirm against the repository.
public static final CharSequence scrape(final String urlStr, final String postRequest, Charset encoding, final String sessionCookieName, int tries)
throws IOException
// Second signature: same parameter list, with the charset parameter named requestEncoding.
public static final CharSequence scrape(final String urlStr, final String postRequest, Charset requestEncoding, final String sessionCookieName,
int tries) throws IOException
{
// Use SCRAPE_DEFAULT_ENCODING when no charset is given.
if (encoding == null)
encoding = SCRAPE_DEFAULT_ENCODING;
// Same fallback, written against the requestEncoding parameter name.
if (requestEncoding == null)
requestEncoding = SCRAPE_DEFAULT_ENCODING;
// Loop until a page is returned or an exception is thrown; `tries` limits the retries.
while (true)
{
try
{
final StringBuilder buffer = new StringBuilder(SCRAPE_INITIAL_CAPACITY);
final URL url = new URL(urlStr);
final HttpURLConnection connection = (HttpURLConnection) url.openConnection();
connection.setDoInput(true);
// a request body is only sent when postRequest is given
connection.setDoOutput(postRequest != null);
connection.setConnectTimeout(SCRAPE_CONNECT_TIMEOUT);
connection.setReadTimeout(SCRAPE_READ_TIMEOUT);
connection.addRequestProperty("User-Agent", SCRAPE_USER_AGENT);
connection.addRequestProperty("Accept", SCRAPE_ACCEPT);
connection.addRequestProperty("Accept-Encoding", "gzip");
// workaround to disable Vodafone compression
connection.addRequestProperty("Cache-Control", "no-cache");
// send the remembered state cookie, if any, when a session cookie name is given
if (sessionCookieName != null && stateCookie != null)
connection.addRequestProperty("Cookie", stateCookie);
// write the form-encoded POST body, converted to bytes with the given charset
if (postRequest != null)
{
final byte[] postRequestBytes = postRequest.getBytes(encoding.name());
connection.setRequestMethod("POST");
connection.addRequestProperty("Content-Type", "application/x-www-form-urlencoded");
connection.addRequestProperty("Content-Length", Integer.toString(postRequestBytes.length));
final OutputStream os = connection.getOutputStream();
os.write(postRequestBytes);
os.close();
}
final int responseCode = connection.getResponseCode();
if (responseCode == HttpURLConnection.HTTP_OK)
{
final String contentType = connection.getContentType();
final String contentEncoding = connection.getContentEncoding();
// a connection that ended up on a different host is reported as an unexpected redirect
if (!url.getHost().equals(connection.getURL().getHost()))
throw new UnexpectedRedirectException(url, connection.getURL());
InputStream is = new BufferedInputStream(connection.getInputStream());
// unwrap gzip when the content encoding says so, or when the content type is application/octet-stream
if ("gzip".equalsIgnoreCase(contentEncoding) || "application/octet-stream".equalsIgnoreCase(contentType))
is = wrapGzip(is);
final Reader pageReader = new InputStreamReader(is, encoding);
copy(pageReader, buffer);
pageReader.close();
// pages of at most SCRAPE_PAGE_EMPTY_THRESHOLD characters are treated as empty (see else branch)
if (buffer.length() > SCRAPE_PAGE_EMPTY_THRESHOLD)
{
// a meta refresh tag in the page is reported as an unexpected redirect; otherwise the page is returned
final Matcher mRefresh = P_REFRESH.matcher(buffer);
if (!mRefresh.find())
{
// remember the part before the first ';' of any Set-Cookie value starting with the session cookie name
if (sessionCookieName != null)
{
for (final Map.Entry<String, List<String>> entry : connection.getHeaderFields().entrySet())
{
if ("set-cookie".equalsIgnoreCase(entry.getKey()))
{
for (final String value : entry.getValue())
{
if (value.startsWith(sessionCookieName))
{
stateCookie = value.split(";", 2)[0];
}
}
}
}
}
return buffer;
}
else
{
throw new UnexpectedRedirectException(url, new URL(mRefresh.group(1)));
}
}
else
{
// empty page: retry while tries remain, otherwise fail with an IOException
final String message = "got empty page (length: " + buffer.length() + ")";
if (tries-- > 0)
System.out.println(message + ", retrying...");
else
throw new IOException(message + ": " + url);
}
}
else if (responseCode == HttpURLConnection.HTTP_FORBIDDEN || responseCode == HttpURLConnection.HTTP_BAD_REQUEST
|| responseCode == HttpURLConnection.HTTP_NOT_ACCEPTABLE || responseCode == HttpURLConnection.HTTP_UNAVAILABLE)
{
// these four status codes are all reported as BlockedException
throw new BlockedException(url);
}
else if (responseCode == HttpURLConnection.HTTP_NOT_FOUND)
{
throw new FileNotFoundException(url.toString());
}
else if (responseCode == HttpURLConnection.HTTP_MOVED_PERM || responseCode == HttpURLConnection.HTTP_MOVED_TEMP)
{
throw new UnexpectedRedirectException(url, connection.getURL());
}
else if (responseCode == HttpURLConnection.HTTP_INTERNAL_ERROR)
{
throw new InternalErrorException(url);
}
else
{
// any other status: retry while tries remain, otherwise fail with an IOException
final String message = "got response: " + responseCode + " " + connection.getResponseMessage();
if (tries-- > 0)
System.out.println(message + ", retrying...");
else
throw new IOException(message + ": " + url);
}
}
catch (final SocketTimeoutException x)
{
// socket timeouts are retried while tries remain, then rethrown
if (tries-- > 0)
System.out.println("socket timed out, retrying...");
else
throw x;
}
}
// Delegating form: obtain the stream from scrapeInputStream() and read it fully into a buffer.
final StringBuilder buffer = new StringBuilder(SCRAPE_INITIAL_CAPACITY);
final InputStream is = scrapeInputStream(urlStr, postRequest, requestEncoding, null, sessionCookieName, tries);
// NOTE(review): the response is decoded with requestEncoding, the same charset passed for the request; confirm this is intended
final Reader pageReader = new InputStreamReader(is, requestEncoding);
// NOTE(review): close() below is not reached if copy() throws; a try/finally may be worth considering
copy(pageReader, buffer);
pageReader.close();
return buffer;
}
private static final long copy(final Reader reader, final StringBuilder builder) throws IOException
@ -280,6 +160,10 @@ public final class ParserUtils
if (!url.getHost().equals(connection.getURL().getHost()))
throw new UnexpectedRedirectException(url, connection.getURL());
final URL redirectUrl = testRedirect(peekFirstChars(is));
if (redirectUrl != null)
throw new UnexpectedRedirectException(url, redirectUrl);
if (sessionCookieName != null)
{
for (final Map.Entry<String, List<String>> entry : connection.getHeaderFields().entrySet())
@ -373,6 +257,19 @@ public final class ParserUtils
return new String(firstBytes, 0, read).replaceAll("\\p{C}", "");
}
/**
 * Recognizes a {@code <meta http-equiv="refresh" content="N;URL=...">} tag, ignoring case. Group 1 captures the
 * text following {@code URL=} up to the closing double quote.
 */
private static final Pattern P_REDIRECT_HTTP_EQUIV = Pattern.compile(
		"<META\\s+http-equiv=\"refresh\"\\s+content=\"\\d+;\\s*URL=([^\"]+)\"", Pattern.CASE_INSENSITIVE);

/**
 * Looks for a redirect announced by an http-equiv refresh meta tag in the given content.
 *
 * @param content
 *            text to search for the meta tag
 * @return the URL captured from the first matching tag, or {@code null} if no tag matches
 * @throws MalformedURLException
 *             if the captured target cannot be parsed as a URL
 */
public static URL testRedirect(final String content) throws MalformedURLException
{
	final Matcher refreshTag = P_REDIRECT_HTTP_EQUIV.matcher(content);

	// no refresh meta tag found: nothing to report
	if (!refreshTag.find())
		return null;

	final String target = refreshTag.group(1);
	return new URL(target);
}
private static final Pattern P_ENTITY = Pattern.compile("&(?:#(x[\\da-f]+|\\d+)|(amp|quot|apos|szlig|nbsp));");
public static String resolveEntities(final CharSequence str)

View file

@ -0,0 +1,40 @@
/*
* Copyright 2014 the original author or authors.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.schildbach.pte.util;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import java.net.URL;
import org.junit.Test;
/**
 * Tests for {@link ParserUtils}.
 *
 * @author Andreas Schildbach
 */
public class ParserUtilsTest
{
	/**
	 * Beginning of a page titled "Vodafone Center" whose refresh meta tag points at center.vodafone.de. The markup is
	 * cut off in the middle of a link.
	 */
	private static final String VODAFONE_REDIRECT_PAGE = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html PUBLIC \"-//WAPFORUM//DTD XHTML Mobile 1.1//EN \" \"http://www.openmobilealliance.org/tech/DTD/xhtml-mobile11.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\"; xml:lang=\"en\"><head><title>Vodafone Center</title><meta http-equiv=\"Cache-Control\" content=\"no-cache\"/><meta http-equiv=\"refresh\" content=\"1;URL=https://center.vodafone.de/vfcenter/index.html?targetUrl=http%3A%2F%2Fwww.fahrinfo-berlin.de/Fahrinfo/bin/query.bin/dn%3fstart=Suchen&REQ0JourneyStopsS0ID=A%253D1%2540L%253D9083301&REQ0JourneyStopsZ0ID=A%253D1%2540L%253D9195009&REQ0HafasSearchForw=1&REQ0JourneyDate=16.06.14&REQ0JourneyTime=16%253A32&REQ0JourneyProduct_prod_list_1=11111011&h2g-direct=11&L=vs_oeffi\"/><style type=\"text/css\">*{border:none;font-family:Arial,Helvetica,sans-serif} body{font-size:69%;line-height:140%;background-color:#F4F4F4 !important}</style></head><body><h1>Sie werden weitergeleitet ...</h1><p>Sollten Sie nicht weitergeleitet werden, klicken Sie bitte <a href=\"https://center.vodafo";

	/** The refresh meta tag of the fixture page must be detected and yield the redirect host. */
	@Test
	public void vodafoneRedirect() throws Exception
	{
		final URL redirectTarget = ParserUtils.testRedirect(VODAFONE_REDIRECT_PAGE);

		assertNotNull(redirectTarget);
		assertEquals("center.vodafone.de", redirectTarget.getHost());
	}
}