CSharp抓取HTML网页内容

    using mshtml;
    using HtmlAgilityPack;

    /// <summary>
    /// Downloads a shop-listing page and extracts one CSV row per shop, using either
    /// the mshtml COM parser or HtmlAgilityPack.
    /// </summary>
    class HTMLCrawler
    {
        /// <summary>
        /// Downloads <paramref name="uri"/>, optionally saves the raw page next to the
        /// executable, and parses it with the selected engine.
        /// </summary>
        /// <param name="index">Page index; used as the file name of the saved HTML copy.</param>
        /// <param name="uri">Address of the page to download.</param>
        /// <param name="szResultPath">File that extracted rows are appended to.</param>
        /// <param name="szErrorPath">File that error messages are appended to.</param>
        /// <param name="htmlEngin">Which HTML parser to use.</param>
        /// <returns>
        /// Counters for the page. <c>bSuccess</c> is true when the page was downloaded and
        /// the shop list was located; it is false after a <see cref="WebException"/> or
        /// when the list could not be found.
        /// </returns>
        private PhaseResultBean PhaseHtml(int index, Uri uri, String szResultPath, String szErrorPath, HTMLEnginType htmlEngin)
        {
            PhaseResultBean result = new PhaseResultBean();
            try
            {
                string pageHtml;
                // WebClient is IDisposable; release it as soon as the download is done.
                using (WebClient client = new WebClient())
                {
                    client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
                    Byte[] pageData = client.DownloadData(uri);
                    pageHtml = Encoding.UTF8.GetString(pageData);
                }

                if (checkSavePages.Checked)
                {
                    String szHtmlPath = XWin32.getExeParentPath() + index.ToString() + ".html";
                    AppendLine(szHtmlPath, pageHtml);
                }

                // The download worked; the parsers below clear the flag again if they
                // cannot locate the shop list in the page.
                result.bSuccess = true;

                switch (htmlEngin)
                {
                    case HTMLEnginType.HTMLEngin_mshtml:
                        PhaseHtml_mshtml(pageHtml, szResultPath, szErrorPath, result);
                        break;
                    case HTMLEnginType.HTMLEngin_HtmlAgilityPack:
                        PhaseHtml_HtmlAgilityPack(pageHtml, szResultPath, szErrorPath, result);
                        break;
                }
            }
            catch (WebException webEx)
            {
                AppendLine(szErrorPath, webEx.Message);
                result.bSuccess = false;
            }

            return result;
        }

        /// <summary>
        /// Parses the page with mshtml and appends one CSV row per list item to
        /// <paramref name="szResultPath"/>. Items that cannot be read are logged to
        /// <paramref name="szErrorPath"/> and counted in <c>result.failed</c>.
        /// </summary>
        /// <param name="pageHtml">Full HTML text of the page.</param>
        /// <param name="szResultPath">File that extracted rows are appended to.</param>
        /// <param name="szErrorPath">File that error messages are appended to.</param>
        /// <param name="result">Counters updated in place.</param>
        private void PhaseHtml_mshtml(String pageHtml, String szResultPath, String szErrorPath, PhaseResultBean result)
        {
            mshtml.HTMLDocument docObject = new mshtml.HTMLDocument();
            mshtml.IHTMLDocument2 doc2 = docObject as mshtml.IHTMLDocument2;
            doc2.write(pageHtml);
            doc2.close();

            mshtml.IHTMLDocument3 doc3 = docObject as mshtml.IHTMLDocument3;

            // Look the list container up once and fail softly when the page has none,
            // instead of letting a NullReferenceException escape to the caller.
            IHTMLElement shopList = doc3.getElementById("shop-all-list");
            if (shopList == null)
            {
                AppendLine(szErrorPath, "Shop list element 'shop-all-list' was not found in the page.");
                result.bSuccess = false;
                return;
            }

            var items = shopList.children[0].children;
            int len = items.length;
            result.total += len;

            foreach (IHTMLElement li in items)
            {
                try
                {
                    // Second child of the list item holds all the text fields.
                    var info = li.children[1];

                    IHTMLElement title = info.children[0];
                    String szTitle = title.innerText;
                    if (szTitle != null) szTitle = szTitle.Replace("\r\n", "-");
                    IHTMLElement star = info.children[1].children[0];
                    String szStar = star.getAttribute("title");
                    IHTMLElement reviewNum = info.children[1].children[1];
                    String szReviewNum = reviewNum.innerText;
                    IHTMLElement meanPrice = info.children[1].children[3];
                    String szMeanPrice = meanPrice.innerText;
                    IHTMLElement category = info.children[2].children[0];
                    String szCategory = category.innerText;
                    IHTMLElement address = info.children[2].children[3];
                    String szAddress = address.innerText;
                    // Strings are immutable: the result of Replace must be assigned back,
                    // otherwise commas in the address break the CSV row.
                    if (szAddress != null) szAddress = szAddress.Replace(",", "-");

                    IHTMLElement taste = info.children[3].children[0];
                    String szTaste = taste.innerText;
                    IHTMLElement evn = info.children[3].children[1];
                    String szEvn = evn.innerText;
                    IHTMLElement service = info.children[3].children[2];
                    String szService = service.innerText;

                    // Append the extracted fields to the result file.
                    AppendLine(szResultPath, string.Join(",", new string[] { szTitle, szStar, szReviewNum, szMeanPrice, szCategory, szAddress, szTaste, szEvn, szService }));
                    result.successed += 1;
                }
                catch (Exception Ex)
                {
                    AppendLine(szErrorPath, Ex.Message);
                    result.failed += 1;
                }
            }
        }

        /// <summary>
        /// Parses the page with HtmlAgilityPack and appends one CSV row per list item to
        /// <paramref name="szResultPath"/>. Items that cannot be read are logged to
        /// <paramref name="szErrorPath"/> and counted in <c>result.failed</c>.
        /// </summary>
        /// <param name="pageHtml">Full HTML text of the page.</param>
        /// <param name="szResultPath">File that extracted rows are appended to.</param>
        /// <param name="szErrorPath">File that error messages are appended to.</param>
        /// <param name="result">Counters updated in place.</param>
        private void PhaseHtml_HtmlAgilityPack(String pageHtml, String szResultPath, String szErrorPath, PhaseResultBean result)
        {
            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(pageHtml);

            const String listXPath = "/html[1]/body[1]/div[4]/div[3]/div[1]/div[1]/div[2]/ul[1]/li";
            HtmlAgilityPack.HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(listXPath);
            // SelectNodes returns null (not an empty collection) when nothing matches.
            if (nodes == null)
            {
                AppendLine(szErrorPath, "No shop list items matched XPath: " + listXPath);
                result.bSuccess = false;
                return;
            }
            result.total += nodes.Count;

            foreach (HtmlAgilityPack.HtmlNode li in nodes)
            {
                try
                {
                    HtmlAgilityPack.HtmlNode titleA = RequireNode(li, "div[2]/div[1]/a[1]");
                    // The second title link is optional (e.g. a branch name).
                    HtmlAgilityPack.HtmlNode titleB = li.SelectSingleNode("div[2]/div[1]/a[2]");
                    String szTitle = StripWhitespace(titleA.InnerText + "-" + (titleB == null ? "" : titleB.InnerText));

                    String szStar = RequireNode(li, "div[2]/div[2]/span[1]").Attributes["title"].Value;

                    String szReviewNum = StripWhitespace(RequireNode(li, "div[2]/div[2]/a[1]").InnerText);
                    String szMeanPrice = StripWhitespace(RequireNode(li, "div[2]/div[2]/a[2]").InnerText);

                    String szCategory = RequireNode(li, "div[2]/div[3]/a[1]").InnerText;

                    HtmlAgilityPack.HtmlNode addressA = RequireNode(li, "div[2]/div[3]/a[2]");
                    HtmlAgilityPack.HtmlNode addressB = RequireNode(li, "div[2]/div[3]/span[1]");
                    // Strings are immutable: keep the result of Replace so commas in the
                    // address do not break the CSV row.
                    String szAddress = (addressA.InnerText + "-" + addressB.InnerText).Replace(",", "-");

                    String szTaste = RequireNode(li, "div[2]/span[1]/span[1]").InnerText;
                    String szEvn = RequireNode(li, "div[2]/span[1]/span[2]").InnerText;
                    String szService = RequireNode(li, "div[2]/span[1]/span[3]").InnerText;

                    // Append the extracted fields to the result file.
                    AppendLine(szResultPath, string.Join(",", new string[] { szTitle, szStar, szReviewNum, szMeanPrice, szCategory, szAddress, szTaste, szEvn, szService }));
                    result.successed += 1;
                }
                catch (Exception Ex)
                {
                    AppendLine(szErrorPath, Ex.Message);
                    result.failed += 1;
                }
            }
        }

        /// <summary>Appends <paramref name="text"/> plus a line terminator to the file at <paramref name="path"/>.</summary>
        private static void AppendLine(String path, String text)
        {
            using (StreamWriter sw = new StreamWriter(path, true))
            {
                sw.WriteLine(text);
            }
        }

        /// <summary>Removes line feeds and spaces from <paramref name="text"/>; null is passed through.</summary>
        private static String StripWhitespace(String text)
        {
            return text == null ? null : text.Replace("\n", "").Replace(" ", "");
        }

        /// <summary>
        /// Returns the first node under <paramref name="parent"/> matching
        /// <paramref name="xpath"/>, or throws so the row is logged with the missing
        /// path rather than a bare null-reference message.
        /// </summary>
        /// <exception cref="InvalidOperationException">No node matches the XPath.</exception>
        private static HtmlAgilityPack.HtmlNode RequireNode(HtmlAgilityPack.HtmlNode parent, String xpath)
        {
            HtmlAgilityPack.HtmlNode node = parent.SelectSingleNode(xpath);
            if (node == null)
            {
                throw new InvalidOperationException("Node not found: " + xpath);
            }
            return node;
        }
    }

    /// <summary>
    /// Mutable counters describing the outcome of parsing one page.
    /// </summary>
    class PhaseResultBean
    {
        /// <summary>Page-level status flag; set to false when the download fails with a WebException.</summary>
        public bool bSuccess;

        /// <summary>Number of list items found on the page.</summary>
        public int total;

        /// <summary>
        /// Number of items counted as successfully processed.
        /// NOTE(review): confirm where this counter is maintained.
        /// </summary>
        public int successed;

        /// <summary>Number of list items whose extraction threw and was logged to the error file.</summary>
        public int failed;
    }

    /// <summary>
    /// Selects which HTML parser is used to extract data from a downloaded page.
    /// </summary>
    public enum HTMLEnginType
    {
        /// <summary>Parse with the mshtml COM document object.</summary>
        HTMLEngin_mshtml = 0,

        /// <summary>Parse with the HtmlAgilityPack library.</summary>
        HTMLEngin_HtmlAgilityPack = 1
    }

Comments are closed.