今天因為工作需要,我要利用 .NET 來寫一個采集功能,實現要求是:采集指定網站頁面中所有圖片,并且可以過濾不符合要求的圖片并保存到本地,同時還需要把標題與內容采集下來。
原理很簡單:抓取網頁源碼 => 正則匹配圖片地址 => 使用C#提供的方法下載圖片
如果是縮略圖形式,再獲取A標簽的HREF地址,過濾出圖片地址即可。
代碼如下 | 複製代碼 |
/// <summary>
/// Extracts the <c>src</c> value of every <c>&lt;img&gt;</c> tag found in an HTML string.
/// </summary>
/// <param name="html">HTML source to scan; <c>null</c> or empty yields an empty result.</param>
/// <returns>
/// The image addresses in document order, each one followed by an <c>@</c>
/// separator (for example <c>a.jpg@b.png@</c>); an empty string when nothing matches.
/// </returns>
static string GetImgUrl(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // The quote around the value is optional so src=x.jpg, src='x.jpg' and src="x.jpg" all match.
    // [^>]*? keeps the search inside one tag; the \s before "src" skips attributes such as data-src.
    const string pattern = @"<img\b[^>]*?\ssrc\s*=\s*[""']?([^""'\s>]+)";

    StringBuilder builder = new StringBuilder();
    foreach (Match match in Regex.Matches(html, pattern, RegexOptions.IgnoreCase))
    {
        builder.Append(match.Groups[1].Value);
        builder.Append('@');
    }
    return builder.ToString();
}

/// <summary>
/// Demo entry point: downloads a page, prints its HTML, then saves every image it references.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    const string pageUrl = "http://www.baidu.com";

    string html = GetHtmlInfo(pageUrl, 15000, Encoding.GetEncoding("GBK"));
    Console.WriteLine(html);

    // RemoveEmptyEntries drops the trailing separator and also copes with "no images at all".
    string[] imgarr = GetImgUrl(html).Split(new char[] { '@' }, StringSplitOptions.RemoveEmptyEntries);
    Uri pageUri = new Uri(pageUrl);

    foreach (string str in imgarr)
    {
        // Image addresses are often relative or protocol-relative; resolve them against the page.
        Uri imageUri;
        if (!Uri.TryCreate(pageUri, str, out imageUri))
        {
            continue;
        }

        // Skip data:, javascript: and similar addresses that cannot be downloaded over HTTP.
        if (imageUri.Scheme != Uri.UriSchemeHttp && imageUri.Scheme != Uri.UriSchemeHttps)
        {
            continue;
        }

        try
        {
            SaveImg(imageUri.AbsoluteUri);
            Console.WriteLine(imageUri.AbsoluteUri);
        }
        catch (WebException ex)
        {
            // One unreachable image must not abort the remaining downloads.
            Console.WriteLine(imageUri.AbsoluteUri + " : " + ex.Message);
        }
        catch (IOException ex)
        {
            Console.WriteLine(imageUri.AbsoluteUri + " : " + ex.Message);
        }
        catch (ArgumentException ex)
        {
            Console.WriteLine(imageUri.AbsoluteUri + " : " + ex.Message);
        }
    }

    Console.Read();
}

/// <summary>
/// Downloads one image into the local save directory and prints its size when it is
/// wider than 100 pixels.
/// </summary>
/// <param name="imgurl">Absolute address of the image to download.</param>
/// <exception cref="ArgumentException">The address does not end in a file name.</exception>
static void SaveImg(string imgurl)
{
    const string saveDirectory = @"D:\Tony";

    // Take the name from the URL path only, so a query string never ends up in the file name.
    string fileName = Path.GetFileName(new Uri(imgurl).AbsolutePath);
    if (string.IsNullOrEmpty(fileName))
    {
        throw new ArgumentException("The image address does not contain a file name.", "imgurl");
    }

    // CreateDirectory is a no-op when the directory already exists.
    Directory.CreateDirectory(saveDirectory);
    string filePath = Path.Combine(saveDirectory, fileName);

    WebRequest request = WebRequest.Create(imgurl);
    using (WebResponse response = request.GetResponse())
    using (Stream reader = response.GetResponseStream())
    using (FileStream writer = new FileStream(filePath, FileMode.Create, FileAccess.Write))
    {
        // FileMode.Create truncates an existing file, so a shorter download leaves no stale bytes.
        reader.CopyTo(writer);
    }

    // Images can be filtered by size here. The file is inspected only after the writer
    // has been closed; opening it earlier fails because the writer holds it exclusively.
    try
    {
        using (Image img = Image.FromFile(filePath))
        {
            if (img.Size.Width > 100)
            {
                Console.WriteLine(img.Size);
            }
        }
    }
    catch (OutOfMemoryException)
    {
        // Image.FromFile reports an unsupported or invalid image format this way.
        Console.WriteLine(filePath + " is not a supported image format.");
    }
}

/// <summary>
/// Downloads the HTML of a page.
/// </summary>
/// <param name="url">Page address; <c>http://</c> is prepended when no scheme is given.</param>
/// <param name="timeout">Request timeout, in milliseconds.</param>
/// <param name="EnCodeType">Encoding used to decode the response body.</param>
/// <returns>
/// The page text with every line terminated by CR LF; an empty string when the status
/// is not 200 OK; the exception message when the request fails.
/// </returns>
static string GetHtmlInfo(string url, int timeout, Encoding EnCodeType)
{
    if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        && !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        url = "http://" + url;
    }

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = timeout;
        // The property holds the header value only; the header name must not be repeated in it.
        request.UserAgent = "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 2.0.40607; .NET CLR 1.1.4322; .NET CLR 3.5.30729)";
        request.Accept = "*/*";
        request.KeepAlive = true;
        request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return string.Empty;
            }

            using (StreamReader reader = new StreamReader(response.GetResponseStream(), EnCodeType))
            {
                StringBuilder builder = new StringBuilder();
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    builder.Append(line);
                    builder.Append("\r\n");
                }
                return builder.ToString();
            }
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract kept from the original: failures are reported through
        // the return value, so callers cannot tell them apart from page content.
        return ex.Message;
    }
}
C#快速獲取網頁頁面的標題
代碼如下 | 複製代碼 |
using System.Text.RegularExpressions;

/// <summary>
/// Returns the text of the first <c>&lt;title&gt;</c> element in an HTML string.
/// </summary>
/// <param name="html">HTML source to scan; <c>null</c> or empty yields an empty result.</param>
/// <returns>The trimmed title text, or an empty string when no title element is found.</returns>
static string GetTitle(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // [\s\S]*? matches across line breaks and stops at the first closing tag, so text
    // after a second "</title>" or another '>' on the same line is never captured.
    Regex reg = new Regex(@"<title\b[^>]*>([\s\S]*?)</title>", RegexOptions.IgnoreCase);
    return reg.Match(html).Groups[1].Value.Trim();
}

/// <summary>
/// Demo entry point: downloads the head of a page and prints it followed by its title.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    string html = GetHtmlInfo("www.rczjp.cn", 5000, Encoding.UTF8);
    Console.WriteLine(html);
    Console.WriteLine(GetTitle(html));
    Console.Read();
}

/// <summary>
/// Downloads the HTML of a page, stopping at the line that contains the closing title tag.
/// </summary>
/// <param name="url">Page address; <c>http://</c> is prepended when no scheme is given.</param>
/// <param name="timeout">Request timeout, in milliseconds.</param>
/// <param name="EnCodeType">Encoding used to decode the response body.</param>
/// <returns>
/// The page text up to and including the line holding <c>&lt;/title&gt;</c>, with CR LF
/// between lines; an empty string when the status is not 200 OK; the exception message
/// when the request fails.
/// </returns>
static string GetHtmlInfo(string url, int timeout, Encoding EnCodeType)
{
    if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        && !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        url = "http://" + url;
    }

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = timeout;
        // The property holds the header value only; the header name must not be repeated in it.
        request.UserAgent = "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 2.0.40607; .NET CLR 1.1.4322; .NET CLR 3.5.30729)";
        request.Accept = "*/*";
        request.KeepAlive = true;
        request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return string.Empty;
            }

            using (StreamReader reader = new StreamReader(response.GetResponseStream(), EnCodeType))
            {
                StringBuilder builder = new StringBuilder();
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    builder.Append(line);

                    // ReadLine returns whole lines, so anything that follows the closing tag
                    // on the same line is kept as well. Only the new line needs checking:
                    // every earlier line was already checked when it was read.
                    if (line.IndexOf("</title>", StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        break;
                    }

                    builder.Append("\r\n");
                }
                return builder.ToString();
            }
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract kept from the original: failures are reported through
        // the return value, so callers cannot tell them apart from page content.
        return ex.Message;
    }
}
該文章在 2017/4/5 0:40:56 編輯過