Friday, September 14, 2007

Grab a webpage in Erlang which is gzipped

First post! Instead of the typical blogging mainstay of a stupid first post that is of no real value to anyone, I'm going to post some code showing how to grab a web page that has been compressed server-side with gzip (HTTP response header "Content-Encoding: gzip").

There are a few tutorials out there showing you how to grab a web page in Erlang, but unfortunately they can only be used if you're going to be doing some very light webpage grabbing. If you're like me and you need to grab a lot of web pages and you pay for your bandwidth with a hosted provider, then you'll quickly go broke.

I'm using the built-in inets module rather than the ibrowse module. It seems everyone uses ibrowse for historical reasons: I suppose inets' HTTP client was pretty lousy in the past, but it seems OK to me now. I chose inets because I can't get the raw binary data from ibrowse like I can from inets. (If you need to use ibrowse for your project, you can convert the response from ibrowse to a binary and adapt this code, and it should work fine, but the inets way should be a little more performant.)

The parse and parse_http functions I found from some code that Joe Armstrong coded some time ago. The code only accepts gzip encoding, although it would be easy to make it accept deflate, but what web server nowadays actually serves a slower and less efficient compression when it can offer something much better :).


%% Request header values sent by get_url/1 with every request.

%% Browser-like User-Agent string.
-define(USER_AGENT, "Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3").
%% Only gzip is requested; get_body/2 only knows how to decode gzip.
-define(ACCEPT_ENCODING, "gzip").
%% Character set requested from the server.
-define(ACCEPT_CHARSET, "utf-8").
%% Media types requested, with relative quality (q) weights.
-define(ACCEPT, "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8").
%% Preferred response languages.
-define(ACCEPT_LANGUAGE,"en-us,en;q=0.5").

%% get_url/1 is the only exported function in this listing.
-export([get_url/1]).

%%----------------------------------------------------------------------
%% get_url(Url) -> {utf8, Body}
%%
%% Fetches Url with an HTTP GET that asks for a gzip-compressed response
%% and returns the body as a binary, gunzipped by get_body/2 when the
%% server actually compressed it.
%%
%% Only "http://" URLs are accepted: for anything else parse/1 returns a
%% term that does not match and the call fails with badmatch. Any result
%% other than {ok, _} from httpc:request/4 also fails with badmatch.
%%
%% NOTE(review): the utf8 tag mirrors the charset requested through
%% Accept-Charset; the charset of the response is not inspected.
%% NOTE(review): assumes the inets application has already been started
%% by the caller.
get_url(Url) ->
    {http, Host, Port, _File} = parse(Url),
    RequestHeaders = [
        {"Host", host_header(Host, Port)},
        {"User-Agent", ?USER_AGENT},
        {"Accept-Encoding", ?ACCEPT_ENCODING},
        {"Accept-Charset", ?ACCEPT_CHARSET},
        {"Accept", ?ACCEPT},
        {"Accept-Language", ?ACCEPT_LANGUAGE}
    ],
    %% httpc replaces the old `http' client module, which no longer exists
    %% in inets. body_format=binary keeps the (possibly gzipped) payload as
    %% raw bytes.
    {ok, {_StatusLine, Headers, Body}} =
        httpc:request(get, {Url, RequestHeaders}, [], [{body_format, binary}]),
    {utf8, get_body(Headers, Body)}.

%% Builds the Host header value. parse/1 splits an explicit port off the
%% host name, so it has to be put back for anything but the default port.
host_header(Host, 80) -> Host;
host_header(Host, Port) -> Host ++ ":" ++ integer_to_list(Port).


%%----------------------------------------------------------------------
%% get_body(Headers, Body) -> binary()
%%
%% Returns the response body, gunzipping it when Headers contains a
%% "content-encoding" entry whose value is "gzip" (compared without
%% regard to case, since content-coding names are case-insensitive).
%% Any other encoding, or no such header, returns Body unchanged.
%%
%% Headers is a list of {Name, Value} string pairs; the lookup key is
%% lower case, so header names are expected to be lower-cased already.
get_body(Headers, Body) ->
    case lists:keyfind("content-encoding", 1, Headers) of
        {_Name, Encoding} when is_list(Encoding) ->
            decode_body(string:to_lower(Encoding), Body);
        _ ->
            Body
    end.

%% Decodes Body according to the lower-cased content-coding name.
decode_body("gzip", Body) -> zlib:gunzip(Body);
decode_body(_Other, Body) -> Body.

%%----------------------------------------------------------------------
%% parse(Url) -> {http, Site, Port, File} | {file, File} | {error, Why}
%%
%% Primitive URL classifier that dispatches on the scheme prefix:
%%   "http://..."  is handed to parse_http/1,
%%   "ftp://..."   is rejected with {error, no_ftp},
%%   "file://..."  yields {file, Path} with everything after the prefix,
%%   anything else yields {error, unknown_url_type}.
%% The prefix match is case-sensitive.

parse("http://" ++ Rest) -> parse_http(Rest);
parse("ftp://" ++ _Rest) -> {error, no_ftp};
parse("file://" ++ Path) -> {file, Path};
parse(_Other) -> {error, unknown_url_type}.

%%----------------------------------------------------------------------
%% parse_http(HostAndPath) -> {http, Site, Port, File}
%%
%% Splits the part of an http URL that follows "http://".
%%   Site is everything before the first ":" or "/".
%%   Port is the integer after the first ":" in the host part; it is 80
%%        when there is no ":" or when the text after it is not an integer.
%%   File is everything from the first "/" onwards, or "/" when the input
%%        contains no "/".
parse_http(X) ->
    %% The host part runs up to (not including) the first "/".
    {HostPort, Path} = lists:splitwith(fun(C) -> C =/= $/ end, X),
    {Site, Port} = split_host_port(HostPort),
    {http, Site, Port, default_path(Path)}.

%% A URL without any "/" after the host refers to the root document.
default_path([]) -> "/";
default_path(Path) -> Path.

%% Separates "host:port" at the first colon; no colon means port 80.
split_host_port(HostPort) ->
    case lists:splitwith(fun(C) -> C =/= $: end, HostPort) of
        {Site, [$: | PortStr]} -> {Site, port_or_default(PortStr)};
        {Site, []} -> {Site, 80}
    end.

%% Converts the port text to an integer, falling back to 80 when it is
%% empty or not a valid integer. Only badarg from list_to_integer/1 is
%% handled; nothing else is swallowed.
port_or_default(PortStr) ->
    try list_to_integer(PortStr) of
        Port -> Port
    catch
        error:badarg -> 80
    end.