美文网首页程序员
【26】使用WebBrowser和mshtml解析网页

【26】使用WebBrowser和mshtml解析网页

作者: 业余玩家 | 来源:发表于2018-08-09 23:59 被阅读25次

C#采集网页的常见方式有 WebClient、WebBrowser、HttpWebRequest,当然还有其他方式。这次为了解析网页,主要使用了 WebBrowser 控件。这种方式比较简单,但感觉速度不太理想(也可能是电脑或者网络的原因)。另外两种方式这次没有尝试,下次再试。

首先,在工具箱中找到 WebBrowser 控件,把它拖到窗口中合适的位置,就可以使用了。你还可以加入一个网页地址输入框,这样就能加载你所输入地址对应的网页。


2018-08-09_234743.png

webbroser.Navigate("你输入的url") 会将指定地址的文档加载到 WebBrowser 控件之中(代码里的 webbroser 是该控件实例的名称)。方法执行之后就能在控件中看到网页,相当于一个浏览器;你还可以控制前进、后退、刷新网页,从而实现一个简单的浏览器。

/// <summary>
/// Starts a scrape: copies the configured field names into the result grid's column
/// headers, derives the record date from the URL, and loads the page into the
/// WebBrowser control. Parsing continues in <see cref="Broserfinished"/> once the
/// page has finished loading.
/// </summary>
/// <param name="sender">Source of the click event (unused).</param>
/// <param name="e">Click event data (unused).</param>
private void Button_Click_2(object sender, RoutedEventArgs e)
{
    if (datalist.Items.Count == 0)
    {
        return;
    }

    // Only relabel columns that exist: with more rules than grid columns the
    // unbounded loop used to throw ArgumentOutOfRangeException.
    int headerCount = Math.Min(datalist.Items.Count, resultgridview.Columns.Count);
    for (int i = 0; i < headerCount; i++)
    {
        Mydata md = datalist.Items[i] as Mydata;
        if (md != null)
        {
            resultgridview.Columns[i].Header = md.MYKEY;
        }
    }

    string url = siteurl.Text;
    if (string.IsNullOrEmpty(url))
    {
        return;
    }

    // The date is read from the link itself; this is specific to one site's URL layout.
    // NOTE(review): when the URL carries no date the previous value of `datetime` is
    // kept (as before), so rows from such a page get the earlier date - confirm intent.
    string parsedDate = ExtractDateFromUrl(url);
    if (parsedDate != null)
    {
        datetime = parsedDate;
    }

    // Detach before attaching so the handler is registered exactly once, and register
    // it before navigating so the completion event cannot be missed.
    webbroser.LoadCompleted -= Broserfinished;
    webbroser.LoadCompleted += Broserfinished;

    // Load the requested document into the WebBrowser control.
    webbroser.Navigate(url);
}

/// <summary>
/// Builds a "20yy/MM/dd" string from the sixth '/'-separated segment of
/// <paramref name="url"/> when the part before its first '.' is exactly six characters.
/// </summary>
/// <param name="url">The address typed by the user.</param>
/// <returns>The formatted date, or null when the URL does not have that shape.</returns>
private static string ExtractDateFromUrl(string url)
{
    string[] segments = url.Split('/');

    // Index 5 is read below, so at least six segments are required
    // (the original only checked for more than three and could throw).
    if (segments.Length <= 5)
    {
        return null;
    }

    string stamp = segments[5].Split('.')[0];
    if (stamp.Length != 6)
    {
        return null;
    }

    return "20" + stamp.Substring(0, 2) + "/" + stamp.Substring(2, 2) + "/" + stamp.Substring(4, 2);
}

通过 webbroser.LoadCompleted += 处理方法 来注册事件处理程序:该方法会在网页加载完毕之后执行,方便抓取网页的所有内容。

// Kept from the original listing; nothing in the visible code reads this list.
List<dataitems> myld = new List<dataitems>();

/// <summary>
/// LoadCompleted handler: scans every div of the loaded page and turns the elements
/// whose class name matches the first two configured rules into result rows.
/// </summary>
/// <remarks>
/// Only the first two rules are consulted: rule 0 fills ITEM1 and rule 1 fills ITEM2.
/// A row is emitted each time rule 1 matches, stamped with the current
/// <c>datetime</c> field. This is tailored to one page layout, not a general solution.
/// </remarks>
/// <param name="sender">Source of the event (unused).</param>
/// <param name="e">Navigation event data (unused).</param>
private void Broserfinished(object sender, NavigationEventArgs e)
{
    var document = this.webbroser.Document as HTMLDocument;
    if (document == null)
    {
        // No document, or not an HTML document: nothing to parse (used to throw).
        return;
    }

    // Resolve the two rules once instead of re-casting them for every element.
    Mydata firstRule = datalist.Items.Count > 0 ? datalist.Items[0] as Mydata : null;
    Mydata secondRule = datalist.Items.Count > 1 ? datalist.Items[1] as Mydata : null;
    if (firstRule == null && secondRule == null)
    {
        return;
    }

    // Elements can be looked up by id or tag name, much like in JavaScript.
    var items = document.getElementsByTagName("div");

    List<dataitems> ld = new List<dataitems>();
    dataitems dt = new dataitems();
    foreach (IHTMLElement item in items)
    {
        // Read each COM property once per element rather than once per rule.
        string text = item.innerText;

        // Skip elements without text, with a '(' in it, or without any digit.
        if (text == null || text.Contains("(") || !isint(text))
        {
            continue;
        }

        string cssClass = item.className;
        if (firstRule != null && cssClass == firstRule.MYVALUE)
        {
            dt.ITEM1 = text;
        }
        if (secondRule != null && cssClass == secondRule.MYVALUE)
        {
            dt.ITEM2 = text;
            dt.ITEM3 = datetime;
            ld.Add(dt);
            dt = new dataitems();
        }
    }

    // Drop rows that never received a first-rule value.
    foreach (dataitems tdt in ld.Where(row => row.ITEM1 != null))
    {
        resultdatalist.Items.Add(tdt);
    }
}

        /// <summary>
        /// Reports whether <paramref name="value"/> contains at least one ASCII digit (0-9).
        /// Despite the name, the text does not have to be a whole integer.
        /// </summary>
        /// <param name="value">Text to inspect.</param>
        /// <returns>True when a digit occurs anywhere in the text; otherwise false.</returns>
        private bool isint(string value)
        {
            Match firstDigit = Regex.Match(value, @"[0-9]");
            return firstDigit.Success;
        }

MSHTML 是微软公司的一个 COM 组件,它封装了 HTML 语言中的所有元素及其属性;通过它提供的标准接口,可以访问指定网页的所有元素。使用之前需要先添加对它的引用,在 VS2017 中可以在"程序集 → 扩展"里找到。


2018-08-10_001627.png

完整代码如下,仅供参考,写得并不通用。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Windows;
using System.Windows.Controls;
using System.Windows.Data;
using System.Windows.Documents;
using System.Windows.Input;
using System.Windows.Media;
using System.Windows.Media.Imaging;
using System.Windows.Navigation;
using System.Windows.Shapes;
using Microsoft.Office.Interop.Excel;
using mshtml;


namespace GetData
{
    /// <summary>
    /// MainWindow.xaml 的交互逻辑
    /// </summary>
    public partial class MainWindow : System.Windows.Window
    {
        /// <summary>
        /// One scraping rule entered by the user: a field name plus the value used to
        /// pick matching elements out of the page.
        /// </summary>
        public class Mydata
        {
            /// <summary>Field name; used as the header of the corresponding result column.</summary>
            public string MYKEY { get; set; }

            /// <summary>Rule value; compared against an element's class name while parsing.</summary>
            public string MYVALUE { get; set; }
        }

        /// <summary>
        /// One scraped result row with up to five text columns.
        /// </summary>
        public class dataitems
        {
            /// <summary>Text of the element that matched the first rule.</summary>
            public string ITEM1 { get; set; }

            /// <summary>Text of the element that matched the second rule.</summary>
            public string ITEM2 { get; set; }

            /// <summary>Date string derived from the page URL.</summary>
            public string ITEM3 { get; set; }

            /// <summary>Spare column; not assigned anywhere in the visible code.</summary>
            public string ITEM4 { get; set; }

            /// <summary>Spare column; not assigned anywhere in the visible code.</summary>
            public string ITEM5 { get; set; }
        }




        /// <summary>
        /// Creates the window and runs <c>InitializeComponent</c>, which is defined
        /// outside this listing (presumably the generated setup for MainWindow.xaml).
        /// </summary>
        public MainWindow()
        {
            this.InitializeComponent();
        }

        /// <summary>
        /// Adds a scraping field: stores the entered name and rule as a new entry in the
        /// rule list and clears both input boxes. Shows a warning when either box is empty.
        /// </summary>
        /// <param name="sender">Source of the click event (unused).</param>
        /// <param name="e">Click event data (unused).</param>
        private void Button_Click(object sender, RoutedEventArgs e)
        {
            // Reject incomplete input up front.
            if (selfname.Text == string.Empty || selfvalue.Text == string.Empty)
            {
                MessageBox.Show("字段和规则不能为空");
                return;
            }

            Mydata rule = new Mydata
            {
                MYKEY = selfname.Text,
                MYVALUE = selfvalue.Text
            };
            datalist.Items.Add(rule);

            // Reset the inputs for the next entry.
            selfname.Text = "";
            selfvalue.Text = "";
        }

        /// <summary>
        /// Deletes a field: removes the currently selected entry from the rule list.
        /// Does nothing when no entry is selected.
        /// </summary>
        /// <param name="sender">Source of the click event (unused).</param>
        /// <param name="e">Click event data (unused).</param>
        private void Button_Click_1(object sender, RoutedEventArgs e)
        {
            object selected = datalist.SelectedItem;
            if (selected == null)
            {
                return;
            }

            datalist.Items.Remove(selected);
        }

        // Date ("20yy/MM/dd") taken from the most recent URL that contained one;
        // copied into ITEM3 of every scraped row.
        string datetime = string.Empty;

        /// <summary>
        /// Starts a scrape: copies the configured field names into the result grid's column
        /// headers, derives the record date from the URL, and loads the page into the
        /// WebBrowser control. Parsing continues in <see cref="Broserfinished"/> once the
        /// page has finished loading.
        /// </summary>
        /// <param name="sender">Source of the click event (unused).</param>
        /// <param name="e">Click event data (unused).</param>
        private void Button_Click_2(object sender, RoutedEventArgs e)
        {
            if (datalist.Items.Count == 0)
            {
                return;
            }

            // Only relabel columns that exist: with more rules than grid columns the
            // unbounded loop used to throw ArgumentOutOfRangeException.
            int headerCount = Math.Min(datalist.Items.Count, resultgridview.Columns.Count);
            for (int i = 0; i < headerCount; i++)
            {
                Mydata md = datalist.Items[i] as Mydata;
                if (md != null)
                {
                    resultgridview.Columns[i].Header = md.MYKEY;
                }
            }

            string url = siteurl.Text;
            if (string.IsNullOrEmpty(url))
            {
                return;
            }

            // The date is read from the link itself; this is specific to one site's URL layout.
            // NOTE(review): when the URL carries no date the previous value of `datetime` is
            // kept (as before), so rows from such a page get the earlier date - confirm intent.
            string parsedDate = ExtractDateFromUrl(url);
            if (parsedDate != null)
            {
                datetime = parsedDate;
            }

            // Detach before attaching so the handler is registered exactly once, and register
            // it before navigating so the completion event cannot be missed.
            webbroser.LoadCompleted -= Broserfinished;
            webbroser.LoadCompleted += Broserfinished;

            // Load the requested document into the WebBrowser control.
            webbroser.Navigate(url);
        }

        /// <summary>
        /// Builds a "20yy/MM/dd" string from the sixth '/'-separated segment of
        /// <paramref name="url"/> when the part before its first '.' is exactly six characters.
        /// </summary>
        /// <param name="url">The address typed by the user.</param>
        /// <returns>The formatted date, or null when the URL does not have that shape.</returns>
        private static string ExtractDateFromUrl(string url)
        {
            string[] segments = url.Split('/');

            // Index 5 is read below, so at least six segments are required
            // (the original only checked for more than three and could throw).
            if (segments.Length <= 5)
            {
                return null;
            }

            string stamp = segments[5].Split('.')[0];
            if (stamp.Length != 6)
            {
                return null;
            }

            return "20" + stamp.Substring(0, 2) + "/" + stamp.Substring(2, 2) + "/" + stamp.Substring(4, 2);
        }

        // Kept from the original listing; nothing in the visible code reads this list.
        List<dataitems> myld = new List<dataitems>();

        /// <summary>
        /// LoadCompleted handler: scans every div of the loaded page and turns the elements
        /// whose class name matches the first two configured rules into result rows.
        /// </summary>
        /// <remarks>
        /// Only the first two rules are consulted: rule 0 fills ITEM1 and rule 1 fills ITEM2.
        /// A row is emitted each time rule 1 matches, stamped with the current
        /// <c>datetime</c> field. This is tailored to one page layout, not a general solution.
        /// </remarks>
        /// <param name="sender">Source of the event (unused).</param>
        /// <param name="e">Navigation event data (unused).</param>
        private void Broserfinished(object sender, NavigationEventArgs e)
        {
            var document = this.webbroser.Document as HTMLDocument;
            if (document == null)
            {
                // No document, or not an HTML document: nothing to parse (used to throw).
                return;
            }

            // Resolve the two rules once instead of re-casting them for every element.
            Mydata firstRule = datalist.Items.Count > 0 ? datalist.Items[0] as Mydata : null;
            Mydata secondRule = datalist.Items.Count > 1 ? datalist.Items[1] as Mydata : null;
            if (firstRule == null && secondRule == null)
            {
                return;
            }

            // Elements can be looked up by id or tag name, much like in JavaScript.
            var items = document.getElementsByTagName("div");

            List<dataitems> ld = new List<dataitems>();
            dataitems dt = new dataitems();
            foreach (IHTMLElement item in items)
            {
                // Read each COM property once per element rather than once per rule.
                string text = item.innerText;

                // Skip elements without text, with a '(' in it, or without any digit.
                if (text == null || text.Contains("(") || !isint(text))
                {
                    continue;
                }

                string cssClass = item.className;
                if (firstRule != null && cssClass == firstRule.MYVALUE)
                {
                    dt.ITEM1 = text;
                }
                if (secondRule != null && cssClass == secondRule.MYVALUE)
                {
                    dt.ITEM2 = text;
                    dt.ITEM3 = datetime;
                    ld.Add(dt);
                    dt = new dataitems();
                }
            }

            // Drop rows that never received a first-rule value.
            foreach (dataitems tdt in ld.Where(row => row.ITEM1 != null))
            {
                resultdatalist.Items.Add(tdt);
            }
        }

        /// <summary>
        /// Exports the rows currently selected in the result list to a CSV file chosen by
        /// the user, writing ITEM1, ITEM2 and ITEM3 of each row through Excel automation.
        /// Shows a message instead when nothing is selected.
        /// </summary>
        /// <param name="sender">Source of the click event (unused).</param>
        /// <param name="e">Click event data (unused).</param>
        private void export_click(object sender, RoutedEventArgs e)
        {
            if (resultdatalist.SelectedItems.Count <= 0)
            {
                MessageBox.Show("请选择导出内容");
                return;
            }

            string fileName;
            using (System.Windows.Forms.SaveFileDialog sfd = new System.Windows.Forms.SaveFileDialog())
            {
                sfd.DefaultExt = "csv";
                sfd.Filter = "Excel文件(*.csv)|*.csv";
                sfd.RestoreDirectory = true;
                sfd.CreatePrompt = false;
                sfd.Title = "导出文件到";

                // Stop when the dialog is cancelled: FileName is then empty and the
                // original went on to call SaveAs with it.
                if (sfd.ShowDialog() != System.Windows.Forms.DialogResult.OK || string.IsNullOrEmpty(sfd.FileName))
                {
                    return;
                }
                fileName = sfd.FileName;
            }

            Microsoft.Office.Interop.Excel.Application app = new Microsoft.Office.Interop.Excel.Application();
            Workbook wk = null;
            try
            {
                // The Excel instance is never shown, so a modal prompt could not be answered.
                app.DisplayAlerts = false;
                wk = app.Workbooks.Add(System.Type.Missing);

                int row = 1;
                foreach (object selected in resultdatalist.SelectedItems)
                {
                    dataitems dt = selected as dataitems;
                    if (dt == null)
                    {
                        continue;
                    }

                    app.Cells[row, 1] = dt.ITEM1;
                    app.Cells[row, 2] = dt.ITEM2;
                    app.Cells[row, 3] = dt.ITEM3;
                    row++;
                }

                // Save to the chosen path as real CSV; without an explicit format the
                // workbook was written in Excel's default format under a .csv name.
                wk.SaveAs(fileName, XlFileFormat.xlCSV);
            }
            finally
            {
                try
                {
                    if (wk != null)
                    {
                        // Already saved above; close without asking to save again.
                        wk.Close(false);
                        System.Runtime.InteropServices.Marshal.ReleaseComObject(wk);
                    }
                }
                finally
                {
                    // The original never called Quit, which can leave a hidden EXCEL.EXE running.
                    app.Quit();
                    System.Runtime.InteropServices.Marshal.ReleaseComObject(app);
                }
            }
        }

        /// <summary>
        /// Reports whether <paramref name="value"/> contains at least one ASCII digit (0-9).
        /// Despite the name, the text does not have to be a whole integer.
        /// </summary>
        /// <param name="value">Text to inspect.</param>
        /// <returns>True when a digit occurs anywhere in the text; otherwise false.</returns>
        private bool isint(string value)
        {
            Match firstDigit = Regex.Match(value, @"[0-9]");
            return firstDigit.Success;
        }
    }
}

相关文章

网友评论

    本文标题:【26】使用WebBrowser和mshtml解析网页

    本文链接:https://www.haomeiwen.com/subject/httkeftx.html