I am having trouble with a C# proxy page I am writing which allows me to do cross domain AJAX calls with Javascript.
The problem is with certain pages that contain pound signs £ that are not HTML encoded in the source that I am trying
to extract with the WebRequest and WebResponse objects.
The page is using a charset of iso-8859-1 which I think is the problem as my object is using UTF-8. I have created
two test pages one using UTF-8 the other iso-8859-1
http://www.strictly-software.com/test_pound_iso. htm
http://www.strictly-software.com/test_pound_utf8 .htm
I can extract the unencoded pound signs successfully from the UTF-8 page but not the ISO page. Obviously I have no control
over the source content that I want to extract so is there a way of getting the pound signs back instead of ? or squares?
I have tried numerous methods e.g byte arrays, memory streams, changing the encoding object to ASCII, supplying content-types
and charsets etc but nothing seems to work.
The code is below
using System;
using System.Collecti ons.Generic;
using System.Text;
using System.Net;
using System.IO;
namespace HattrickHeaven
{
public class HTTPRequest
{
private int _status = 0;
private string _statusDesc = "";
private string _responseConten t = "";
private string _errorType = "";
private bool _retry = false;
// only here for testing
private bool _debug = true;
private string _debugFile = @"d:\inetpub\ww wroot\hattrickh eaven.com\LogFi les\ripperDebug .txt";
private void ShowDebug(strin g msg)
{
if(!_debug) return;
if(!String.IsNu llOrEmpty(msg))
{
msg += " ";
System.IO.File. AppendAllText(_ debugFile,msg,E ncoding.UTF8);
// System.IO.File. AppendAllText(_ debugFile, msg, Encoding.ASCII) ;
}
}
public HTTPRequest(str ing URL, WebProxy proxy, string robotAgent)
{
HttpWebRequest client = (HttpWebRequest )WebRequest.Cre ate(URL);
client.Method = "GET";
client.ContentL ength = 0;
client.ContentT ype = "applicatio n/x-www-form-urlencoded;char set=charset=UTF-8";
if (proxy != null)
{
client.Proxy = proxy;
}
client.Timeout = 40000;
client.UserAgen t = robotAgent;
try
{
// get the response
HttpWebResponse response = (HttpWebRespons e)client.GetRes ponse();
_statusDesc = response.Status Description;
_status = Convert.ToInt32 (response.Statu sCode);
StreamReader ResponseStream = new StreamReader(re sponse.GetRespo nseStream(), Encoding.UTF8);
_responseConten t = ResponseStream. ReadToEnd();
ResponseStream. Close();
response.Close( );
}
catch (Exception err)
{
_errorType = err.Message.ToS tring();
}
}
public int StatusCode
{
get { return _status; }
}
public string StatusDesc
{
get { return _statusDesc; }
}
public string Response
{
get { return _responseConten t; }
}
public string ErrorType
{
get { return _errorType; }
}
public bool Retry
{
get { return _retry; }
}
}
}
Thanks for any help
The problem is with certain pages that contain pound signs £ that are not HTML encoded in the source that I am trying
to extract with the WebRequest and WebResponse objects.
The page is using a charset of iso-8859-1 which I think is the problem as my object is using UTF-8. I have created
two test pages one using UTF-8 the other iso-8859-1
http://www.strictly-software.com/test_pound_iso. htm
http://www.strictly-software.com/test_pound_utf8 .htm
I can extract the unencoded pound signs successfully from the UTF-8 page but not the ISO page. Obviously I have no control
over the source content that I want to extract so is there a way of getting the pound signs back instead of ? or squares?
I have tried numerous methods e.g byte arrays, memory streams, changing the encoding object to ASCII, supplying content-types
and charsets etc but nothing seems to work.
The code is below
using System;
using System.Collecti ons.Generic;
using System.Text;
using System.Net;
using System.IO;
namespace HattrickHeaven
{
public class HTTPRequest
{
private int _status = 0;
private string _statusDesc = "";
private string _responseConten t = "";
private string _errorType = "";
private bool _retry = false;
// only here for testing
private bool _debug = true;
private string _debugFile = @"d:\inetpub\ww wroot\hattrickh eaven.com\LogFi les\ripperDebug .txt";
private void ShowDebug(strin g msg)
{
if(!_debug) return;
if(!String.IsNu llOrEmpty(msg))
{
msg += " ";
System.IO.File. AppendAllText(_ debugFile,msg,E ncoding.UTF8);
// System.IO.File. AppendAllText(_ debugFile, msg, Encoding.ASCII) ;
}
}
public HTTPRequest(str ing URL, WebProxy proxy, string robotAgent)
{
HttpWebRequest client = (HttpWebRequest )WebRequest.Cre ate(URL);
client.Method = "GET";
client.ContentL ength = 0;
client.ContentT ype = "applicatio n/x-www-form-urlencoded;char set=charset=UTF-8";
if (proxy != null)
{
client.Proxy = proxy;
}
client.Timeout = 40000;
client.UserAgen t = robotAgent;
try
{
// get the response
HttpWebResponse response = (HttpWebRespons e)client.GetRes ponse();
_statusDesc = response.Status Description;
_status = Convert.ToInt32 (response.Statu sCode);
StreamReader ResponseStream = new StreamReader(re sponse.GetRespo nseStream(), Encoding.UTF8);
_responseConten t = ResponseStream. ReadToEnd();
ResponseStream. Close();
response.Close( );
}
catch (Exception err)
{
_errorType = err.Message.ToS tring();
}
}
public int StatusCode
{
get { return _status; }
}
public string StatusDesc
{
get { return _statusDesc; }
}
public string Response
{
get { return _responseConten t; }
}
public string ErrorType
{
get { return _errorType; }
}
public bool Retry
{
get { return _retry; }
}
}
}
Thanks for any help