美文网首页程序员
【26】使用WebBrowser和mshtml解析网页

【26】使用WebBrowser和mshtml解析网页

作者: 业余玩家 | 来源:发表于2018-08-09 23:59 被阅读25次

    C#采集网页的方式有:WebClient,WebBrowser,HttpWebRequest,当然还有其他的方式。这次为了解析网页,主要使用了WebBrowser这种方式。这种方式比较简单,但是感觉速度不行,可能是电脑或者网络的原因。其他的方式还没有做尝试,下次再试试另外两种。

    首先,你需要在工具箱中找到WebBrowser这个控件,然后把它拖到窗口中合适的位置,就可以使用它了。你还可以加入一个网页地址输入栏,这样就可以加载你所输入地址的网页了。


    2018-08-09_234743.png

    webbroser.Navigate("你输入的url") 会将指定的文档加载到WebBrowser控件之中。执行该方法之后,你就能在控件中看到网页,相当于一个浏览器;你还可以控制前进,后退,刷新网页,从而实现一个简单的浏览器。

    /// <summary>
    /// Starts a page request: copies the configured field names onto the result
    /// column headers, derives a date string from the URL, and navigates the
    /// browser control to the address typed by the user.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void Button_Click_2(object sender, RoutedEventArgs e)
    {
        if (datalist.Items.Count == 0)
        {
            return;
        }

        // Never index past the columns that exist: more configured fields than
        // result columns would otherwise throw ArgumentOutOfRangeException.
        int headerCount = Math.Min(datalist.Items.Count, resultgridview.Columns.Count);
        for (int i = 0; i < headerCount; i++)
        {
            Mydata md = datalist.Items[i] as Mydata;
            if (md != null)
            {
                resultgridview.Columns[i].Header = md.MYKEY;
            }
        }

        if (siteurl.Text == string.Empty)
        {
            return;
        }

        // Site-specific, not general: the sixth '/'-separated URL segment is
        // expected to start with six characters read as yyMMdd.
        string[] arr = siteurl.Text.Split('/');
        if (arr.Length > 5) // arr[5] is read below, so at least six segments are required
        {
            string stem = arr[5].Split('.')[0];
            if (stem.Length == 6)
            {
                datetime = "20" + stem.Substring(0, 2) + "/" + stem.Substring(2, 2) + "/" + stem.Substring(4, 2);
            }
        }

        // Detach before attaching so repeated clicks never register the handler
        // twice, and attach before navigating so the handler is already in
        // place when the load completes.
        webbroser.LoadCompleted -= Broserfinished;
        webbroser.LoadCompleted += Broserfinished;

        // Load the requested document into the browser control.
        webbroser.Navigate(siteurl.Text);
    }
    

    通过 webbroser.LoadCompleted += 注册的方法会在网页加载完毕之后执行,方便抓取网页的所有内容。

    // Declared next to the handler; nothing in the visible code reads or writes it.
    List<dataitems> myld = new List<dataitems>();

    /// <summary>
    /// LoadCompleted handler: scans every div of the loaded document and turns
    /// the ones whose class name matches the first two configured rules into
    /// result rows.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Navigation event data (unused).</param>
    private void Broserfinished(object sender, NavigationEventArgs e)
    {
        var document = this.webbroser.Document as HTMLDocument;
        if (document == null)
        {
            // The cast fails when the loaded content is not an HTML document;
            // there is nothing to scan in that case.
            return;
        }

        // Elements can be looked up by id or tag name, much like in JavaScript.
        var items = document.getElementsByTagName("div");

        // Matches elements by class name and queues them for the list view.
        // Written for one specific page layout; not general.
        List<dataitems> ld = new List<dataitems>();
        dataitems dt = new dataitems();
        foreach (IHTMLElement item in items)
        {
            // Read the COM properties once per element instead of once per rule.
            string text = item.innerText;

            // Skip entries that do not fit the expected format: no text, text
            // containing "(", or text without any digit.
            if (text == null || text.Contains("(") || !isint(text))
            {
                continue;
            }

            string className = item.className;
            for (int i = 0; i < datalist.Items.Count; i++)
            {
                Mydata md = datalist.Items[i] as Mydata;
                if (md == null || className != md.MYVALUE)
                {
                    continue;
                }

                if (i == 0)
                {
                    dt.ITEM1 = text;
                }
                else if (i == 1)
                {
                    // The second rule completes a row: stamp it with the date
                    // and start a fresh one.
                    dt.ITEM2 = text;
                    dt.ITEM3 = datetime;
                    ld.Add(dt);
                    dt = new dataitems();
                }
            }
        }

        // Rows that never received a first-rule value are dropped.
        foreach (dataitems row in ld)
        {
            if (row.ITEM1 != null)
            {
                resultdatalist.Items.Add(row);
            }
        }
    }
    
            /// <summary>
            /// Reports whether <paramref name="value"/> contains at least one ASCII digit.
            /// </summary>
            /// <remarks>
            /// Despite the name, the pattern is not anchored, so the whole string
            /// does not have to be an integer.
            /// </remarks>
            /// <param name="value">Text to inspect.</param>
            /// <returns>true when a digit 0-9 occurs anywhere in the text.</returns>
            private bool isint(string value)
            {
                Match firstDigit = Regex.Match(value, @"[0-9]");
                return firstDigit.Success;
            }
    

    MSHTML是微软公司的一个COM组件,该组件封装了HTML语言中的所有元素及其属性,通过其提供的标准接口,可以访问指定网页的所有元素。要使用它,首先需要添加对它的引用,在VS2017中可以在"程序集-扩展"里面找到。


    2018-08-10_001627.png

    完整代码如下,仅供参考,写得并不通用。

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Threading.Tasks;
    using System.Windows;
    using System.Windows.Controls;
    using System.Windows.Data;
    using System.Windows.Documents;
    using System.Windows.Input;
    using System.Windows.Media;
    using System.Windows.Media.Imaging;
    using System.Windows.Navigation;
    using System.Windows.Shapes;
    using Microsoft.Office.Interop.Excel;
    using mshtml;
    
    
    namespace GetData
    {
        /// <summary>
        /// MainWindow.xaml 的交互逻辑
        /// </summary>
        public partial class MainWindow : System.Windows.Window
        {
            /// <summary>
            /// One capture rule: a field name paired with the value used to find it on the page.
            /// </summary>
            public class Mydata
            {
                /// <summary>Field name; shown as a result column header.</summary>
                public string MYKEY { get; set; }

                /// <summary>Rule value; compared with an element's class name when scanning the page.</summary>
                public string MYVALUE { get; set; }
            }
    
            /// <summary>
            /// One result row with up to five string cells.
            /// </summary>
            public class dataitems
            {
                /// <summary>First cell of the row.</summary>
                public string ITEM1 { get; set; }

                /// <summary>Second cell of the row.</summary>
                public string ITEM2 { get; set; }

                /// <summary>Third cell of the row.</summary>
                public string ITEM3 { get; set; }

                /// <summary>Fourth cell of the row.</summary>
                public string ITEM4 { get; set; }

                /// <summary>Fifth cell of the row.</summary>
                public string ITEM5 { get; set; }
            }
    
    
    
    
            /// <summary>
            /// Creates the window; all setup is delegated to InitializeComponent,
            /// which is defined in another part of this partial class.
            /// </summary>
            public MainWindow()
            {
                InitializeComponent();
            }
    
            /// <summary>
            /// Adds a capture field, built from the name and rule text boxes, to the field list.
            /// Shows a message instead when either box is empty.
            /// </summary>
            /// <param name="sender">Event source (unused).</param>
            /// <param name="e">Event data (unused).</param>
            private void Button_Click(object sender, RoutedEventArgs e)
            {
                bool missingInput = selfname.Text == string.Empty || selfvalue.Text == string.Empty;
                if (missingInput)
                {
                    MessageBox.Show("字段和规则不能为空");
                    return;
                }

                Mydata rule = new Mydata
                {
                    MYKEY = selfname.Text,
                    MYVALUE = selfvalue.Text
                };
                datalist.Items.Add(rule);

                // Reset both inputs so the next field can be typed straight away.
                selfname.Text = "";
                selfvalue.Text = "";
            }
    
            /// <summary>
            /// Removes the currently selected capture field from the field list, if any.
            /// </summary>
            /// <param name="sender">Event source (unused).</param>
            /// <param name="e">Event data (unused).</param>
            private void Button_Click_1(object sender, RoutedEventArgs e)
            {
                if (datalist.SelectedItem == null)
                {
                    return;
                }

                datalist.Items.Remove(datalist.SelectedItem);
            }
    
            // Date string (yyyy/MM/dd) derived from the last requested URL; written
            // by Button_Click_2 and copied into each result row by Broserfinished.
            string datetime = string.Empty;

            /// <summary>
            /// Starts a page request: copies the configured field names onto the result
            /// column headers, derives a date string from the URL, and navigates the
            /// browser control to the address typed by the user.
            /// </summary>
            /// <param name="sender">Event source (unused).</param>
            /// <param name="e">Event data (unused).</param>
            private void Button_Click_2(object sender, RoutedEventArgs e)
            {
                if (datalist.Items.Count == 0)
                {
                    return;
                }

                // Never index past the columns that exist: more configured fields than
                // result columns would otherwise throw ArgumentOutOfRangeException.
                int headerCount = Math.Min(datalist.Items.Count, resultgridview.Columns.Count);
                for (int i = 0; i < headerCount; i++)
                {
                    Mydata md = datalist.Items[i] as Mydata;
                    if (md != null)
                    {
                        resultgridview.Columns[i].Header = md.MYKEY;
                    }
                }

                if (siteurl.Text == string.Empty)
                {
                    return;
                }

                // Site-specific, not general: the sixth '/'-separated URL segment is
                // expected to start with six characters read as yyMMdd.
                string[] arr = siteurl.Text.Split('/');
                if (arr.Length > 5) // arr[5] is read below, so at least six segments are required
                {
                    string stem = arr[5].Split('.')[0];
                    if (stem.Length == 6)
                    {
                        datetime = "20" + stem.Substring(0, 2) + "/" + stem.Substring(2, 2) + "/" + stem.Substring(4, 2);
                    }
                }

                // Detach before attaching so repeated clicks never register the handler
                // twice, and attach before navigating so the handler is already in
                // place when the load completes.
                webbroser.LoadCompleted -= Broserfinished;
                webbroser.LoadCompleted += Broserfinished;

                // Load the requested document into the browser control.
                webbroser.Navigate(siteurl.Text);
            }
    
            // Declared next to the handler; nothing in the visible code reads or writes it.
            List<dataitems> myld = new List<dataitems>();

            /// <summary>
            /// LoadCompleted handler: scans every div of the loaded document and turns
            /// the ones whose class name matches the first two configured rules into
            /// result rows.
            /// </summary>
            /// <param name="sender">Event source (unused).</param>
            /// <param name="e">Navigation event data (unused).</param>
            private void Broserfinished(object sender, NavigationEventArgs e)
            {
                var document = this.webbroser.Document as HTMLDocument;
                if (document == null)
                {
                    // The cast fails when the loaded content is not an HTML document;
                    // there is nothing to scan in that case.
                    return;
                }

                var items = document.getElementsByTagName("div");

                List<dataitems> ld = new List<dataitems>();
                dataitems dt = new dataitems();
                foreach (IHTMLElement item in items)
                {
                    // Read the COM properties once per element instead of once per rule.
                    string text = item.innerText;

                    // Skip entries that do not fit the expected format: no text, text
                    // containing "(", or text without any digit.
                    if (text == null || text.Contains("(") || !isint(text))
                    {
                        continue;
                    }

                    string className = item.className;
                    for (int i = 0; i < datalist.Items.Count; i++)
                    {
                        Mydata md = datalist.Items[i] as Mydata;
                        if (md == null || className != md.MYVALUE)
                        {
                            continue;
                        }

                        if (i == 0)
                        {
                            dt.ITEM1 = text;
                        }
                        else if (i == 1)
                        {
                            // The second rule completes a row: stamp it with the date
                            // and start a fresh one.
                            dt.ITEM2 = text;
                            dt.ITEM3 = datetime;
                            ld.Add(dt);
                            dt = new dataitems();
                        }
                    }
                }

                // Rows that never received a first-rule value are dropped.
                foreach (dataitems row in ld)
                {
                    if (row.ITEM1 != null)
                    {
                        resultdatalist.Items.Add(row);
                    }
                }
            }
    
            /// <summary>
            /// Exports the selected result rows (ITEM1, ITEM2, ITEM3 in columns 1-3) to a
            /// CSV file chosen by the user, using Excel automation.
            /// </summary>
            /// <param name="sender">Event source (unused).</param>
            /// <param name="e">Event data (unused).</param>
            private void export_click(object sender, RoutedEventArgs e)
            {
                if (resultdatalist.SelectedItems.Count == 0)
                {
                    MessageBox.Show("请选择导出内容");
                    return;
                }

                string fileName;
                using (System.Windows.Forms.SaveFileDialog sfd = new System.Windows.Forms.SaveFileDialog())
                {
                    sfd.DefaultExt = "csv";
                    sfd.Filter = "Excel文件(*.csv)|*.csv";
                    sfd.RestoreDirectory = true;
                    sfd.CreatePrompt = false;
                    sfd.Title = "导出文件到";

                    // A cancelled dialog leaves FileName empty; saving to an empty path
                    // must not be attempted.
                    if (sfd.ShowDialog() != System.Windows.Forms.DialogResult.OK || string.IsNullOrEmpty(sfd.FileName))
                    {
                        return;
                    }

                    fileName = sfd.FileName;
                }

                Microsoft.Office.Interop.Excel.Application app = new Microsoft.Office.Interop.Excel.Application();
                Workbook wk = null;
                try
                {
                    // Keep the hidden Excel instance from raising its own prompts
                    // (overwrite confirmation, CSV feature-loss warning).
                    app.DisplayAlerts = false;
                    wk = app.Workbooks.Add(System.Type.Missing);

                    int row = 1;
                    foreach (object selected in resultdatalist.SelectedItems)
                    {
                        dataitems dt = selected as dataitems;
                        if (dt == null)
                        {
                            continue;
                        }

                        app.Cells[row, 1] = dt.ITEM1;
                        app.Cells[row, 2] = dt.ITEM2;
                        app.Cells[row, 3] = dt.ITEM3;
                        row++;
                    }

                    // The dialog only offers *.csv, so write real CSV content rather
                    // than the default workbook format under a .csv name.
                    wk.SaveAs(fileName, XlFileFormat.xlCSV);
                }
                finally
                {
                    if (wk != null)
                    {
                        // Already saved above; close without asking to save again.
                        wk.Close(false);
                    }

                    // Without Quit the automation-started Excel process stays alive.
                    app.Quit();
                    System.Runtime.InteropServices.Marshal.ReleaseComObject(app);
                }
            }
    
            /// <summary>
            /// Reports whether <paramref name="value"/> contains at least one ASCII digit.
            /// </summary>
            /// <remarks>
            /// Despite the name, the pattern is not anchored, so the whole string
            /// does not have to be an integer.
            /// </remarks>
            /// <param name="value">Text to inspect.</param>
            /// <returns>true when a digit 0-9 occurs anywhere in the text.</returns>
            private bool isint(string value)
            {
                Match firstDigit = Regex.Match(value, @"[0-9]");
                return firstDigit.Success;
            }
        }
    }
    

    相关文章

      网友评论

        本文标题:【26】使用WebBrowser和mshtml解析网页

        本文链接:https://www.haomeiwen.com/subject/httkeftx.html