From 27894dbf78f60ebb98d52fec0e1caf1a0ef092e7 Mon Sep 17 00:00:00 2001 From: Andreas Schildbach Date: Mon, 16 Jun 2014 11:47:18 +0200 Subject: [PATCH] Make ParserUtil.scrape() use .scrapeInputStream(). It duplicated a lot of code. Add testcase for detecting redirect by http-equiv meta tag. --- .../de/schildbach/pte/util/ParserUtils.java | 159 +++--------------- .../schildbach/pte/util/ParserUtilsTest.java | 40 +++++ 2 files changed, 68 insertions(+), 131 deletions(-) create mode 100644 enabler/test/de/schildbach/pte/util/ParserUtilsTest.java diff --git a/enabler/src/de/schildbach/pte/util/ParserUtils.java b/enabler/src/de/schildbach/pte/util/ParserUtils.java index f6bf02cb..c58d62fb 100644 --- a/enabler/src/de/schildbach/pte/util/ParserUtils.java +++ b/enabler/src/de/schildbach/pte/util/ParserUtils.java @@ -26,7 +26,7 @@ import java.io.OutputStream; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.net.HttpURLConnection; -import java.net.SocketTimeoutException; +import java.net.MalformedURLException; import java.net.URL; import java.net.URLDecoder; import java.net.URLEncoder; @@ -55,9 +55,6 @@ public final class ParserUtils private static final int SCRAPE_CONNECT_TIMEOUT = 5000; private static final int SCRAPE_READ_TIMEOUT = 15000; private static final Charset SCRAPE_DEFAULT_ENCODING = Charset.forName("ISO-8859-1"); - private static final int SCRAPE_PAGE_EMPTY_THRESHOLD = 2; - private static final Pattern P_REFRESH = Pattern.compile(" SCRAPE_PAGE_EMPTY_THRESHOLD) - { - final Matcher mRefresh = P_REFRESH.matcher(buffer); - if (!mRefresh.find()) - { - if (sessionCookieName != null) - { - for (final Map.Entry> entry : connection.getHeaderFields().entrySet()) - { - if ("set-cookie".equalsIgnoreCase(entry.getKey())) - { - for (final String value : entry.getValue()) - { - if (value.startsWith(sessionCookieName)) - { - stateCookie = value.split(";", 2)[0]; - } - } - } - } - } - - return buffer; - } - else - { - throw new UnexpectedRedirectException(url, new URL(mRefresh.group(1))); - } - } - else - { - final String message = "got empty page (length: " + buffer.length() + ")"; - if (tries-- > 0) - System.out.println(message + ", retrying..."); - else - throw new IOException(message + ": " + url); - } - } - else if (responseCode == HttpURLConnection.HTTP_FORBIDDEN || responseCode == HttpURLConnection.HTTP_BAD_REQUEST - || responseCode == HttpURLConnection.HTTP_NOT_ACCEPTABLE || responseCode == HttpURLConnection.HTTP_UNAVAILABLE) - { - throw new BlockedException(url); - } - else if (responseCode == HttpURLConnection.HTTP_NOT_FOUND) - { - throw new FileNotFoundException(url.toString()); - } - else if (responseCode == HttpURLConnection.HTTP_MOVED_PERM || responseCode == HttpURLConnection.HTTP_MOVED_TEMP) - { - throw new UnexpectedRedirectException(url, connection.getURL()); - } - else if (responseCode == HttpURLConnection.HTTP_INTERNAL_ERROR) - { - throw new InternalErrorException(url); - } - else - { - final String message = "got response: " + responseCode + " " + connection.getResponseMessage(); - if (tries-- > 0) - System.out.println(message + ", retrying..."); - else - throw new IOException(message + ": " + url); - } - } - catch (final SocketTimeoutException x) - { - if (tries-- > 0) - System.out.println("socket timed out, retrying..."); - else - throw x; - } - } + final StringBuilder buffer = new StringBuilder(SCRAPE_INITIAL_CAPACITY); + final InputStream is = scrapeInputStream(urlStr, postRequest, requestEncoding, null, sessionCookieName, tries); + final Reader pageReader = new InputStreamReader(is, requestEncoding); + copy(pageReader, buffer); + pageReader.close(); + return buffer; } private static final long copy(final Reader reader, final StringBuilder builder) throws IOException @@ -280,6 +160,10 @@ public final class ParserUtils if (!url.getHost().equals(connection.getURL().getHost())) throw new UnexpectedRedirectException(url, connection.getURL()); + final URL redirectUrl = testRedirect(peekFirstChars(is)); + if (redirectUrl != null) + throw new UnexpectedRedirectException(url, redirectUrl); + if (sessionCookieName != null) { for (final Map.Entry> entry : connection.getHeaderFields().entrySet()) @@ -373,6 +257,19 @@ public final class ParserUtils return new String(firstBytes, 0, read).replaceAll("\\p{C}", ""); } + private static final Pattern P_REDIRECT_HTTP_EQUIV = Pattern.compile(". + */ + +package de.schildbach.pte.util; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +import java.net.URL; + +import org.junit.Test; + +/** + * @author Andreas Schildbach + */ +public class ParserUtilsTest +{ + @Test + public void vodafoneRedirect() throws Exception + { + final URL url = ParserUtils + .testRedirect("Vodafone Center

Sie werden weitergeleitet ...

Sollten Sie nicht weitergeleitet werden, klicken Sie bitte