• The eternal confessions of a beautiful mind...
  • DamianM.Co.UK
  • Home
  • About
  • Archives
  • Contact
  • Sitemap
  • My Flickr

    IMG_9585IMG_9115IMG_9113IMG_9111IMG_9078IMG_9075IMG_9069IMG_9065IMG_9041IMG_9032IMG_8963IMG_8928IMG_8916IMG_8915IMG_8904IMG_8876IMG_8858IMG_8830IMG_8828IMG_8826

  • Recent Posts

    • Excuse my French
    • A good catholic lightswitch
    • What they would look like if they weren’t famous!!
    • Funniest Valedictorian Speech Ever
    • Can you spot the bands?
    • Strictly for the office
    • Failing the exam with dignity
    • New Ohio Roller Coaster - INSANE!!!
    • Speeding
    • Captions not required
    • Unfortunate Backgrounds
    • Unfortunate Signs
    • OMG!!
    • Women as explained by engineer
    • The Monty Hall Problem
  • My Tools

    • Blog_LinkIt
    • DCoda Theme
    • DCoda Widgets
    • RSS_Sticky
    • WordPress.org
    • WP_BlogNetworking
    • WP_BoilerPlate
    • WP_Censor
    • WP_ContactMe
    • WP_DeliciousPost
    • WP_EasyReply
    • WP_HeadNFoot
    • WP_LinkIt
    • WP_LinkSync
    • WP_OneInstall
    • WP_PostDate
    • WP_PostNotes
    • WP_RssSticky
    • WP_Spoiler
    • WP_Submission
  • My Web

    • ASPAlliance
    • ClaimID
    • del.icio.us
    • Digg
    • DSLRBlog
    • DVDProfiler
    • Flickr
    • Honeyed SPAM
    • My Blog
    • My company
    • MYSpace
    • WordPress.org
    • YouTube

    News

    News.ascx

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    
    <%@ Control Language="vb" AutoEventWireup="false" Codebehind="News.ascx.vb" Inherits="DamianM.News" %>
    <%-- Two-column list of news items. The code-behind binds dlNews to a table whose
         rows expose the strURL, strHeadline and strSummary columns used below. --%>
    <asp:datalist id="dlNews" runat="server" repeatcolumns="2" borderwidth="1" itemstyle-wrap="true" cellspacing="5" cellpadding="10" headerstyle-backcolor="#CC4444" headerstyle-font-bold="true" headerstyle-font-size="20" headerstyle-forecolor="White" itemstyle-font-name="Georgia" headerstyle-font-name="Georgia" itemstyle-backcolor="#EEEEEE" itemstyle-verticalalign="Top" itemstyle-horizontalalign="Left" width="300" showheader="true" enableviewstate="false">
    	<%-- Header content must live inside a template; literal content directly
    	     inside the DataList is not valid. --%>
    	<headertemplate>
    		UK News <img src="http://www.pixunlimited.co.uk/guardian/site/distribution/buttons/guardianUnlim.gif" alt="Guardian Unlimited" />
    	</headertemplate>
    	<itemtemplate>
    		<%-- Single quotes delimit the attribute so the expression can use double quotes. --%>
    		<strong><a href='<%# DataBinder.Eval(Container.DataItem, "strURL") %>'>
    				<%# DataBinder.Eval(Container.DataItem, "strHeadline") %>
    			</a></strong>
     
    		<%# DataBinder.Eval(Container.DataItem, "strSummary") %>
    	</itemtemplate>
    </asp:datalist>

    News.ascx.vb

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    
    'Code-behind for the News user control: downloads a Guardian syndication
    'page, extracts the headlines with a regular expression and binds them to
    'the dlNews DataList.
    'You may notice that no namespaces have been imported;
    'all classes are referred to in full for demonstration purposes.
    Public MustInherit Class News
        Inherits System.Web.UI.UserControl
        Protected WithEvents dlNews As System.Web.UI.WebControls.DataList

        'address of the syndication page that is scraped on every page load
        Private Const NewsUrl As String = "http://www.guardian.co.uk/syndication/service/0,11065,331-0-5,00.html"

        'one news item: a single-quoted link (strURL, strHeadline) followed,
        'after the next <BR>, by the summary text up to the next tag (strSummary)
        Private Const NewsPattern As String = "<A HREF='(?<strURL>[^']+)'[\s]*?>(?<strHeadline>[^<]+)</A>[\s\w\W]*?<BR>(?<strSummary>[^<]+)<"

        'A compiled regular expression is expensive to construct, so it is built
        'once and shared rather than being re-created on every request.
        Private Shared ReadOnly NewsRegex As New System.Text.RegularExpressions.Regex(NewsPattern, System.Text.RegularExpressions.RegexOptions.Compiled)

        'Picks the text encoding used to decode a response.
        'WebRes: the response whose body is about to be read.
        'Returns the encoding named by the HTTP charset when the response is an
        'HTTP response and the name is recognised, otherwise ISO-8859-1.
        Private Shared Function getResponseEncoding(ByVal WebRes As System.Net.WebResponse) As System.Text.Encoding
            If TypeOf WebRes Is System.Net.HttpWebResponse Then
                Dim strCharset As String = CType(WebRes, System.Net.HttpWebResponse).CharacterSet
                If Not strCharset Is Nothing AndAlso strCharset.Length > 0 Then
                    Try
                        Return System.Text.Encoding.GetEncoding(strCharset)
                    Catch ex As System.ArgumentException
                        'unrecognised charset name: use the fallback below
                    Catch ex As System.NotSupportedException
                        'charset not available on this platform: use the fallback below
                    End Try
                End If
            End If
            'ISO-8859-1 maps every byte to the character with the same code, so
            'no input can fail to decode.
            Return System.Text.Encoding.GetEncoding("iso-8859-1")
        End Function

        'A pretty average standard data scraping function.
        'strURL: address of the page to download.
        'Returns the page body as text, or "" when the download fails for any
        'reason; the remote site is outside our control, so a failure must not
        'break the page hosting this control.
        Private Function getHTML(ByVal strURL As String) As String
            Dim WebReq As System.Net.WebRequest
            WebReq = System.Net.WebRequest.Create(strURL)

            Dim strNews As String = ""
            Dim WebRes As System.Net.WebResponse = Nothing
            Dim WebStream As System.IO.StreamReader = Nothing
            Try
                WebRes = WebReq.GetResponse()

                'Decode with the charset the server declares. The previous UTF-7
                'decoding treated "+...-" sequences in the page as encoded text
                'and corrupted them.
                WebStream = New System.IO.StreamReader(WebRes.GetResponseStream(), getResponseEncoding(WebRes))
                strNews = WebStream.ReadToEnd()
            Catch
                'deliberate best-effort: any failure yields an empty page
                strNews = ""
            Finally
                'release the reader and the connection even when reading failed
                If Not WebStream Is Nothing Then WebStream.Close()
                If Not WebRes Is Nothing Then WebRes.Close()
            End Try
            Return strNews
        End Function

        'Scrapes the news page into a table.
        'Returns a table with the string columns strURL, strHeadline and
        'strSummary, holding one row per item matched in the downloaded page;
        'the table is empty when nothing matched or the download failed.
        Private Function getNews() As System.Data.DataTable
            'create the table to be returned
            Dim tblNews As New System.Data.DataTable()
            tblNews.Columns.Add("strURL")
            tblNews.Columns.Add("strHeadline")
            tblNews.Columns.Add("strSummary")

            'scrape the data
            Dim Matches As System.Text.RegularExpressions.MatchCollection = NewsRegex.Matches(getHTML(NewsUrl))
            Dim NewsMatch As System.Text.RegularExpressions.Match

            'loop through all matches filling out the table as you go
            For Each NewsMatch In Matches
                Dim rowNewsItem As System.Data.DataRow = tblNews.NewRow()
                rowNewsItem("strURL") = NewsMatch.Groups("strURL").Value
                rowNewsItem("strHeadline") = NewsMatch.Groups("strHeadline").Value
                rowNewsItem("strSummary") = NewsMatch.Groups("strSummary").Value
                tblNews.Rows.Add(rowNewsItem)
            Next
            Return tblNews
        End Function

        'The properties below let the hosting page change the width, the number
        'of columns and the header colour of the list.

        'Width of the news list.
        Public Property width() As System.Web.UI.WebControls.Unit
            Get
                Return dlNews.Width
            End Get
            Set(ByVal Value As System.Web.UI.WebControls.Unit)
                dlNews.Width = Value
            End Set
        End Property

        'Number of columns the news items are laid out in.
        Public Property cols() As Integer
            Get
                Return dlNews.RepeatColumns
            End Get
            Set(ByVal Value As Integer)
                dlNews.RepeatColumns = Value
            End Set
        End Property

        'Background colour of the list header.
        Public Property HeaderStyleBackColor() As System.Drawing.Color
            Get
                Return dlNews.HeaderStyle.BackColor
            End Get
            Set(ByVal Value As System.Drawing.Color)
                dlNews.HeaderStyle.BackColor = Value
            End Set
        End Property

        'Binds the list on every load, postbacks included: the markup disables
        'view state on dlNews, so its items are not restored automatically.
        Private Sub Page_Load(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles MyBase.Load
            dlNews.DataSource = getNews()
            dlNews.DataBind()
        End Sub
    End Class

    2 Responses to “News”

    1. DamianM » Blog Archive » Screen Scraping Lists Says:
      June 15th, 2007 at 6:42 pm

      [...] There are many articles about data scraping, concerning returning an entire page or a particular element. Building on the base of the other articles, we will be using the grouping constructs to easily retrieve a list of headlines from Guardian Unlimited. You will find the function to scrape the HTML functionally the same as in the other articles, but with the addition of a try-catch statement, as you can never be too cautious when using resources outside of your control. News Source Code [...]

    2. xqhsfzmooq Says:
      August 14th, 2007 at 9:20 pm

      Hello! Good Site! Thanks you! spdkqszarfsh

    Leave a Reply

    Related Posts from the Past:

    • Hello, I must be going.
    • Safari? So Goody?
    • Screen Scraping Lists