資源簡介
利用DOM-TREE模型對網頁進行表示
對原始網頁進行修正,例如補充缺省的標簽等
利用網頁正文提取方法對網頁進行正文提取,去除網頁中的噪聲信息,提取出網頁中的正文、相關超鏈接

代碼片段和文件信息
using?System;
using?System.IO;
using?System.Collections.Generic;
using?System.ComponentModel;
using?System.Data;
using?System.Drawing;
using?System.Linq;
using?System.Text;
using?System.Windows.Forms;
using?mshtml;
using?System.Runtime.InteropServices;
/// <summary>
/// Managed import of the COM <c>IPersistStreamInit</c> interface
/// (IID 7FD52380-4E07-101B-AE2D-08002B2EC713), declared as an IUnknown-based
/// interface so a COM object can be cast to it and driven from managed code.
/// </summary>
/// <remarks>
/// Because this is a <c>ComImport</c> interface, the members must stay in this
/// order; the runtime maps them onto the native vtable by position.
/// </remarks>
[ComVisible(true), ComImport(), Guid("7FD52380-4E07-101B-AE2D-08002B2EC713"), InterfaceTypeAttribute(ComInterfaceType.InterfaceIsIUnknown)]
public interface IPersistStreamInit
{
    /// <summary>Retrieves the class identifier of the object into <paramref name="pClassID"/>.</summary>
    void GetClassID([In, Out] ref Guid pClassID);

    /// <summary>
    /// Dirty-state query. <c>PreserveSig</c> means the raw native return value is
    /// handed back as a 32-bit int instead of being converted into an exception.
    /// </summary>
    [return: MarshalAs(UnmanagedType.I4)]
    [PreserveSig]
    int IsDirty();

    /// <summary>Initializes the object from the stream <paramref name="pstm"/>.</summary>
    void Load([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm);

    /// <summary>
    /// Saves the object to the stream <paramref name="pstm"/>.
    /// <paramref name="fClearDirty"/> is marshalled as a 32-bit integer flag.
    /// </summary>
    void Save([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm,
        [In, MarshalAs(UnmanagedType.I4)] int fClearDirty);

    /// <summary>Size query for the stream needed to save the object.</summary>
    // NOTE(review): a by-value long marked [Out, LPArray] looks inconsistent; the
    // parameter was presumably meant to be an out/array value. Left unchanged to keep
    // the declared signature - confirm against the native definition before calling.
    void GetSizeMax([Out, MarshalAs(UnmanagedType.LPArray)] long pcbSize);

    /// <summary>Initializes the object to a default state.</summary>
    void InitNew();
}
namespace?WindowsFormsApplication1
{
????public?partial?class?Form1?:?Form
????{
????????public?Form1()
????????{
????????????InitializeComponent();?
????????????webBrowser1.Navigate(“HttpstyleUriParser://www.baidu.com“);?
????????}
????????private?void?webBrowser1_DocumentCompleted(object?sender?WebBrowserDocumentCompletedEventArgs?e)
????????{
????????}
????????private?void?treeView1_AfterSelect(object?sender?TreeViewEventArgs?e)
????????{
????????}
????????private?void?button1_Click(object?sender?EventArgs?e)
????????{
????????????if?(webBrowser1.Document?!=?null)
????????????{???//獲取html?
????????????????StreamReader?sr?=?new?StreamReader(webBrowser1.DocumentStream?Encoding.GetEncoding(“gb2312“));
????????????????String?html?=?sr.ReadToEnd();
????????????????richTextBox1.Text?=?html;
????????????????//獲取dom樹?
????????????????IHTMLDocument2?doc2?=?Parse(html);
????????????????IHTMLDocument3?htmldocument?=?(IHTMLDocument3)doc2;
????????????????IHTMLDOMNode?rootDomNode?=?(IHTMLDOMNode)htmldocument.documentElement;??//獲取Dom樹?
????????????????TreeNode?root?=?treeView1.Nodes.Add(“HTML“);??//跟節點?
????????????????InsertDOMNodes(rootDomNode?root);??//把其他節點插入到跟節點中?
????????????}
????????????else
????????????{
????????????????MessageBox.Show(“webbrowser為空“);
????????????}??
????????}
????????unsafe?IHTMLDocument2?Parse(string?s)???????//unsafe關鍵字表示不安全上下文,該上下文是任何涉及指針的操作所必需的。???
????????{
????????????IHTMLDocument2?pDocument?=?new?HTMLDocumentClass();
????????????if?(pDocument?!=?null)
????????????{
????????????????IPersistStreamInit?pPersist?=?pDocument?as?IPersistStreamInit;??//as運算符類似于強制轉換操作;如果轉換不可行,as會返回null而不是引發異常。?
????????????????pPersist.InitNew();
????????????????pPersist?=?null;
????????????????IMarkupServices?ms?=?pDocument?as?IMarkupServices;
????????????????if?(ms?!=?null)
????????????????{
????????????????????IMarkupContainer?pMC?=?null;
????????????????????IMarkupPointer?pStart?pEnd;
????????????????????ms.CreateMarkupPointer(out?pStart);
????????????????????ms.CreateMarkupPointer(o
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????文件??????12288??2010-11-04?11:24??WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.exe
?????文件??????28160??2010-11-04?11:24??WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.pdb
?????文件??????14328??2010-11-04?11:25??WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.vshost.exe
?????文件????????490??2007-07-21?01:33??WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.vshost.exe.manifest
?????文件???????5357??2010-11-04?11:24??WindowsFormsApplication1\WindowsFormsApplication1\Form1.cs
?????文件???????3966??2010-11-04?11:24??WindowsFormsApplication1\WindowsFormsApplication1\Form1.Designer.cs
?????文件???????5814??2010-11-04?11:24??WindowsFormsApplication1\WindowsFormsApplication1\Form1.resx
?????文件???????1387??2010-11-04?11:25??WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.csproj.FileListAbsolute.txt
?????文件????????847??2010-11-04?11:24??WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.csproj.GenerateResource.Cache
?????文件??????12288??2010-11-04?11:24??WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.exe
?????文件????????180??2010-11-04?11:24??WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.Form1.resources
?????文件??????28160??2010-11-04?11:24??WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.pdb
?????文件????????180??2010-11-04?11:21??WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.Properties.Resources.resources
?????文件????????516??2010-11-04?11:01??WindowsFormsApplication1\WindowsFormsApplication1\Program.cs
?????文件???????1466??2010-11-04?10:18??WindowsFormsApplication1\WindowsFormsApplication1\Properties\AssemblyInfo.cs
?????文件???????2877??2010-11-04?10:18??WindowsFormsApplication1\WindowsFormsApplication1\Properties\Resources.Designer.cs
?????文件???????5612??2010-11-04?10:18??WindowsFormsApplication1\WindowsFormsApplication1\Properties\Resources.resx
?????文件???????1109??2010-11-04?10:18??WindowsFormsApplication1\WindowsFormsApplication1\Properties\Settings.Designer.cs
?????文件????????249??2010-11-04?10:18??WindowsFormsApplication1\WindowsFormsApplication1\Properties\Settings.settings
?????文件???????4162??2010-11-04?11:20??WindowsFormsApplication1\WindowsFormsApplication1\WindowsFormsApplication1.csproj
?????文件??????12288??2010-11-04?11:24??WindowsFormsApplication1\WindowsFormsApplication1.exe
?????文件????????962??2010-11-04?10:18??WindowsFormsApplication1\WindowsFormsApplication1.sln
????..A..H.?????18944??2010-11-04?11:25??WindowsFormsApplication1\WindowsFormsApplication1.suo
?????目錄??????????0??2010-11-04?11:26??WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\TempPE
?????目錄??????????0??2010-11-04?11:26??WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug
?????目錄??????????0??2010-11-04?11:26??WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug
?????目錄??????????0??2010-11-04?11:26??WindowsFormsApplication1\WindowsFormsApplication1\bin
?????目錄??????????0??2010-11-04?11:26??WindowsFormsApplication1\WindowsFormsApplication1\obj
?????目錄??????????0??2010-11-04?11:26??WindowsFormsApplication1\WindowsFormsApplication1\Properties
?????目錄??????????0??2010-11-04?11:26??WindowsFormsApplication1\WindowsFormsApplication1
............此處省略4個文件信息
評論
共有 條評論