c# 爬虫(三) 文件上传

在上一篇中,我们说了模拟登录

下面我们说说附件上传。

据说,最早的 HTTP 协议是不支持附件上传的,后来又添加了 RFC 1867(基于表单的文件上传,即 multipart/form-data 格式,其写法沿用了 RFC 2045 的 MIME 规范),才支持附件上传。关于附件上传,请参见

http://www.cnblogs.com/greenerycn/archive/2010/05/15/csharp_http_post.html

 

好了,其实用C#模拟上传附件,主要的难点就在于如何构建此协议的格式以及编码。

作为备忘,我这里只放一段成品代码了。

参数:URL,上传控件的id,附件的本地全路径,传输的Key-value数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
/// <summary>
/// Uploads a local file together with additional form fields to <paramref name="url"/>
/// as a single multipart/form-data POST request and returns the response body.
/// </summary>
/// <param name="url">Address the request is posted to.</param>
/// <param name="fileControlID">Form field name used for the file part (the name of the upload control).</param>
/// <param name="fileFullPath">Full local path of the file to upload.</param>
/// <param name="dic">Extra form fields sent as key/value parts; <c>null</c> is treated as "no extra fields".</param>
/// <returns>The response body, read with <see cref="StreamReader"/>'s default encoding.</returns>
public string HttpWebUpload(string url, string fileControlID, string fileFullPath, Dictionary<string, string> dic)
{
    // multipart/form-data requires CRLF line breaks; Environment.NewLine (what
    // StringBuilder.AppendLine emits) is "\n" on non-Windows platforms.
    const string CrLf = "\r\n";
    string fileContentType = "application/octet-stream"; // alternatives: "text/plain", "image/jpeg"

    // Strip the directory part; accept both '\' and '/' as separators.
    int index = fileFullPath.LastIndexOfAny(new[] { '\\', '/' });
    string filename = fileFullPath.Substring(index + 1);

    var boundary = DateTime.Now.Ticks.ToString("x");
    var beginBoundary = "--" + boundary;
    var reqContentType = "multipart/form-data; boundary=" + boundary;

    // Reads the complete file and closes the handle again (the previous FileStream
    // was never disposed and a single Read call may return fewer bytes than asked).
    byte[] btfileValue = File.ReadAllBytes(fileFullPath);

    // Header of the file part, terminated by the empty line that precedes the content.
    StringBuilder fileHeader = new StringBuilder();
    fileHeader.Append(beginBoundary).Append(CrLf);
    fileHeader.Append(string.Format("Content-Disposition: form-data; name=\"{0}\"; filename=\"{1}\"", fileControlID, filename)).Append(CrLf);
    fileHeader.Append(string.Format("Content-Type: {0}", fileContentType)).Append(CrLf);
    fileHeader.Append(CrLf);
    var btfileKey = Encoding.UTF8.GetBytes(fileHeader.ToString());

    // Everything that follows the file bytes: the line break closing the file content,
    // one part per form field, then the closing boundary.
    StringBuilder dicData = new StringBuilder();
    dicData.Append(CrLf);
    if (dic != null)
    {
        foreach (var item in dic)
        {
            dicData.Append(beginBoundary).Append(CrLf);
            dicData.Append(string.Format("Content-Disposition:form-data; name=\"{0}\"", item.Key)).Append(CrLf);
            dicData.Append(CrLf);
            dicData.Append(item.Value).Append(CrLf);
        }
    }

    dicData.Append(beginBoundary + "--").Append(CrLf);
    var btDic = Encoding.UTF8.GetBytes(dicData.ToString());

    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    req.Method = "POST";
    req.AllowAutoRedirect = false;
    req.ContentType = reqContentType;
    if (!string.IsNullOrEmpty(UserAgent))
    {
        req.UserAgent = this.UserAgent;
    }

    if (!string.IsNullOrEmpty(Cookie))
    {
        req.CookieContainer = new CookieContainer();
        req.CookieContainer.SetCookies(req.RequestUri, this.Cookie);
    }

    // Sum as long so that large files cannot overflow the int addition.
    req.ContentLength = (long)btfileKey.Length + btfileValue.Length + btDic.Length;
    using (Stream postDataStream = req.GetRequestStream())
    {
        postDataStream.Write(btfileKey, 0, btfileKey.Length);
        postDataStream.Write(btfileValue, 0, btfileValue.Length);
        postDataStream.Write(btDic, 0, btDic.Length);
    }

    // Dispose the response, its stream and the reader so the connection is released.
    using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
    using (Stream respStream = resp.GetResponseStream())
    using (StreamReader reader = new StreamReader(respStream))
    {
        return reader.ReadToEnd();
    }
}

  注意,这里如果需要cookie,则添加对应的cookie。

调用方法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
// Example call: post f:\3.pdf through the "fupCert" upload control of an ASP.NET WebForms page.
string controlid = "fupCert";
string filefullPath = "f:\\3.pdf";
string url = "http://10.35.3.240/NT_HandbookExtend/FrmCertificate.aspx?type=N&zdtype=J&DELEGATENO=SBWJ16100001&ID=SB20161018000001%2c3206960757&Action=Modify&manualRecordID=SL20161018000001&tm=0.17032586549994377";

// Fetch the __VIEWSTATE and __EVENTVALIDATION hidden field values from the page.
var hiddenVal = web.GetDoNetHiddenValuesByUrl(url);

// Form fields posted alongside the file, in the same order as before.
Dictionary<string, string> dic = new Dictionary<string, string>
{
    { "__EVENTARGUMENT", string.Empty },
    { "__EVENTTARGET", "btnUpload" },
    { "__EVENTVALIDATION", hiddenVal.Eventvalidation },
    { "__PREVIOUSPAGE", "EncayjCF95BJXxMazWGgd9UdPYeLp64GjuJlZ-rvnQ5n34-y7KQYllE35nAdVFvk0" },
    { "__VIEWSTATE", hiddenVal.Vievstate },
    { "__VIEWSTATEENCRYPTED", string.Empty },
    { "grdNavigatorPRE_EMS3_CUS_IMG$ctl06", string.Empty },
    { "grdNavigatorPRE_EMS3_CUS_IMG$ctl07", "15" },
    { "grdNavigatorPRE_EMS3_CUS_IMG$labelRowCount", "8" },
    { "hfERP", "http://www.nteport.gov.cn/cerp/platform/erp/documentManagement/documentDocking.jsp" },
    { "hidFileType", "333" },
    { "hidID", string.Empty },
};

//web.UserAgent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)";
string html = web.HttpWebUpload(url, controlid, filefullPath, dic);

  

其他注意事项:

1.如果不限制文件类型,请使用 ContentType = "application/octet-stream";这个我也没怎么懂,尴尬。。。

2. 编码很重要。请求体是以字节流的形式发送的:文件内容按原始字节原样写入,不做任何字符编码转换;其余按 RFC 2045(MIME)格式构建的部分(分隔符、各部分的头信息、表单字段)则使用 UTF-8 编码。

相关文章