workaround for spurious pages with empty content type

git-svn-id: https://public-transport-enabler.googlecode.com/svn/trunk@390 0924bc21-9374-b0fa-ee44-9ff1593b38f0
This commit is contained in:
andreas.schildbach@gmail.com 2010-12-13 23:13:04 +00:00
parent d3df888509
commit c63dccc326

View file

@@ -167,33 +167,47 @@ public final class ParserUtils
public static final InputStream scrapeInputStream(final String url) throws IOException public static final InputStream scrapeInputStream(final String url) throws IOException
{ {
return scrapeInputStream(url, null); return scrapeInputStream(url, null, 3);
} }
public static final InputStream scrapeInputStream(final String url, final String postRequest) throws IOException public static final InputStream scrapeInputStream(final String url, final String postRequest, int tries) throws IOException
{ {
final HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection(); while (true)
connection.setDoInput(true);
connection.setDoOutput(postRequest != null);
connection.setConnectTimeout(SCRAPE_CONNECT_TIMEOUT);
connection.setReadTimeout(SCRAPE_READ_TIMEOUT);
connection.addRequestProperty("User-Agent", SCRAPE_USER_AGENT);
// workaround to disable Vodafone compression
connection.addRequestProperty("Cache-Control", "no-cache");
if (postRequest != null)
{ {
connection.setRequestMethod("POST"); final HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
connection.addRequestProperty("Content-Type", "application/x-www-form-urlencoded");
connection.addRequestProperty("Content-Length", Integer.toString(postRequest.length()));
final Writer writer = new OutputStreamWriter(connection.getOutputStream(), SCRAPE_DEFAULT_ENCODING); connection.setDoInput(true);
writer.write(postRequest); connection.setDoOutput(postRequest != null);
writer.close(); connection.setConnectTimeout(SCRAPE_CONNECT_TIMEOUT);
connection.setReadTimeout(SCRAPE_READ_TIMEOUT);
connection.addRequestProperty("User-Agent", SCRAPE_USER_AGENT);
// workaround to disable Vodafone compression
connection.addRequestProperty("Cache-Control", "no-cache");
if (postRequest != null)
{
connection.setRequestMethod("POST");
connection.addRequestProperty("Content-Type", "application/x-www-form-urlencoded");
connection.addRequestProperty("Content-Length", Integer.toString(postRequest.length()));
final Writer writer = new OutputStreamWriter(connection.getOutputStream(), SCRAPE_DEFAULT_ENCODING);
writer.write(postRequest);
writer.close();
}
if (connection.getContentType() != null)
{
return connection.getInputStream();
}
else
{
final String message = "got page without content type";
if (tries-- > 0)
System.out.println(message + ", retrying...");
else
throw new IOException(message + ": " + url);
}
} }
return connection.getInputStream();
} }
private static final Pattern P_ENTITY = Pattern.compile("&(?:#(x[\\da-f]+|\\d+)|(amp|quot|apos));"); private static final Pattern P_ENTITY = Pattern.compile("&(?:#(x[\\da-f]+|\\d+)|(amp|quot|apos));");