[Tkabber-dev] [tclxmpp] r121 committed - * examples/rssbot.tcl: Get XML encoding from XML document itself and...

codesite-noreply at google.com codesite-noreply at google.com
Sat Oct 10 21:58:37 MSD 2009


Revision: 121
Author: sgolovan
Date: Sat Oct 10 10:57:46 2009
Log: 	* examples/rssbot.tcl: Get XML encoding from XML document itself and
	  not from HTTP header. Also, strip HTML markup from items description.

http://code.google.com/p/tclxmpp/source/detail?r=121

Modified:
  /trunk/ChangeLog
  /trunk/examples/rssbot.tcl

=======================================
--- /trunk/ChangeLog	Sun Oct  4 04:31:25 2009
+++ /trunk/ChangeLog	Sat Oct 10 10:57:46 2009
@@ -1,3 +1,8 @@
+2009-10-10  Sergei Golovan  <sgolovan at nes.ru>
+
+	* examples/rssbot.tcl: Get XML encoding from XML document itself and
+	  not from HTTP header. Also, strip HTML markup from items description.
+
  2009-10-04  Sergei Golovan  <sgolovan at nes.ru>

  	* examples/jsend.tcl: Restored historical name and added -date option
=======================================
--- /trunk/examples/rssbot.tcl	Sun Oct  4 04:31:25 2009
+++ /trunk/examples/rssbot.tcl	Sat Oct 10 10:57:46 2009
@@ -15,9 +15,9 @@

  package require Tcl 8.5
  package require http 2
-package require mime
  package require tls
  package require uri
+package require htmlparse

  package require xmpp
  package require xmpp::auth
@@ -93,12 +93,9 @@
          }
      }

-    array set aprops [lindex [mime::parseaddress $options(-from)] 0]
-    if {[set x [string first / $aprops(domain)]] >= 0} {
-        set aprops(resource) [string range $aprops(domain) [expr {$x + 1}]  
end]
-        set aprops(domain) [string range $aprops(domain) 0 [expr {$x - 1}]]
-    } else {
-        set aprops(resource) "rssbot"
+    ::xmpp::jid::split $options(-from) node domain resource
+    if {[string equal $resource ""]} {
+        set resource "rssbot"
      }

      set options(-xlist) {}
@@ -150,35 +147,35 @@
          }

          # Connect to a server
-        ::xmpp::connect $xlib $aprops(domain) $port -transport $transport
+        ::xmpp::connect $xlib $domain $port -transport $transport

          if {!$options(-tls) && $options(-starttls)} {
              # Open XMPP stream
-            set sessionID [::xmpp::openStream $xlib $aprops(domain) \
+            set sessionID [::xmpp::openStream $xlib $domain \
                                                      -version 1.0]

              ::xmpp::starttls::starttls $xlib

-            ::xmpp::sasl::auth $xlib -username  $aprops(local) \
+            ::xmpp::sasl::auth $xlib -username  $node \
                                       -password  $options(-password) \
-                                     -resource  $aprops(resource)
+                                     -resource  $resource
          } elseif {$options(-sasl)} {
              # Open XMPP stream
-            set sessionID [::xmpp::openStream $xlib $aprops(domain) \
+            set sessionID [::xmpp::openStream $xlib $domain \
                                                      -version 1.0]

-            ::xmpp::sasl::auth $xlib -username  $aprops(local) \
+            ::xmpp::sasl::auth $xlib -username  $node \
                                       -password  $options(-password) \
-                                     -resource  $aprops(resource)
+                                     -resource  $resource
          } else {
              # Open XMPP stream
-            set sessionID [::xmpp::openStream $xlib $aprops(domain)]
+            set sessionID [::xmpp::openStream $xlib $domain]

              # Authenticate
              ::xmpp::auth::auth $xlib -sessionid $sessionID \
-                                     -username  $aprops(local) \
+                                     -username  $node \
                                       -password  $options(-password) \
-                                     -resource  $aprops(resource)
+                                     -resource  $resource
          }

          set roster [::xmpp::roster::new $xlib]
@@ -476,7 +473,10 @@
              continue
          }

-        if {[catch { ::http::geturl $site } httpT]} {
+        # Sometimes the RSS encoding can be application/xml instead of  
text/xml,
+        # so treat all data as binary and recode it separately
+
+        if {[catch { ::http::geturl $site -binary 1 } httpT]} {
              ::LOG "$site: $httpT"
              continue
          }
@@ -500,7 +500,7 @@
                          set mtime $t
                      }
                      foreach {k v} [process $site $mtime [expr {$now +  
(30*60)}] \
-                                           $now [::http::data $httpT]] {
+                                           $now [recodeXML [::http::data  
$httpT]]] {
                          if {$v} {
                              set updateP($k) 1
                          }
@@ -661,9 +661,13 @@
                  ::LOG "DEBUG $info(site): article for $url already exists"
                  return
              }
+
+            set info(description) [removeHTTPMarkup $info(description)]

              if {![string compare $info(body) ""]} {
                  set info(body) [string  
trim "$info(description)\n$info(url)"]
+            } else {
+                set info(body) [removeHTTPMarkup $info(body)]
              }

              set args {}
@@ -1042,6 +1046,90 @@
          bgerror $result
      }
  }
+
+proc rssbot::removeHTTPMarkup {html} {
+    set text ""
+    ::htmlparse::parse \
+        -cmd [namespace code [list processHTTPTag [info level] text]] $html
+    return $text
+}
+
+proc rssbot::processHTTPTag {level var tag slash attrs cdata} {
+    upvar #$level $var text
+
+    set cdata [regsub {\s+} [::htmlparse::mapEscapes $cdata] { }]
+
+    switch -glob -- $tag:$slash {
+        p: -
+        br: {
+            append text "\n" $cdata
+        }
+        tr: {
+            append text "\n"
+        }
+        th: -
+        td: {
+            append text "\t" $cdata
+        }
+        default {
+            append text $cdata
+        }
+    }
+}
+
+# The following code is mostly taken from http://wiki.tcl.tk/15326
+
+proc rssbot::recodeXML {xml} {
+    # The autodetection of the encoding follows
+    # XML Recomendation, Appendix F
+
+    set closeIndex 0
+
+    if {![binary scan $xml "H8" firstBytes]} {
+        # very short (< 4 Bytes) file
+        set encoding utf-8
+    }
+
+    # If the entity has a XML Declaration, the first four characters
+    # must be "<?xm".
+    switch $firstBytes {
+        "3c3f786d" {
+            # UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS,
+            # EUC, or any other 7-bit, 8-bit, or mixed-width encoding which
+            # ensures that the characters of ASCII have their normal  
positions,
+            # width and values; the actual encoding declaration must be  
read to
+            # detect which of these applies, but since all of these  
encodings
+            # use the same bit patterns for the ASCII characters, the  
encoding
+            # declaration itself can be read reliably.
+
+            # Try to find the end of the XML Declaration
+            set closeIndex [string first ">" $xml]
+            if {$closeIndex < 0} {
+                error "Weird XML data or not XML data at all"
+            }
+
+            set xmlDeclaration [string range $xml 0 $closeIndex]
+            incr closeIndex
+            # extract the encoding information
+            set pattern {^[^>]+encoding=[\x20\x9\xd\xa]*["']([^ "']+)['"]}
+            # emacs or vim: "
+            if {![regexp $pattern $xmlDeclaration - encStr]} {
+                # Probably something like <?xml version="1.0"?>.
+                # Without encoding declaration, pass-thru
+                set encoding utf-8
+            } else {
+                set encoding [::http::CharsetToEncoding $encStr]
+            }
+        }
+        default {
+            # TODO: 16 and 32-bit encodings
+            # pass-thru
+            set encoding iso8859-1
+        }
+    }
+
+    return [encoding convertfrom $encoding [string range $xml $closeIndex  
end]]
+}

  # The following code is taken from http://wiki.tcl.tk/13094

@@ -1193,11 +1281,9 @@
  # The following code is taken from http://wiki.tcl.tk/13094

  namespace eval rfc2822 {
-
      namespace export parseDate

      variable datepats {}
-
  }

  # AddDatePat --
@@ -1223,9 +1309,8 @@
  #       Adds a complete regexp and a complete [clock scan] pattern to
  #       'datepats'

-proc rfc2822::AddDatePat { wpat wgrp ypat ygrp mdpat mdgrp
+proc rfc2822::AddDatePat { wpat wgrp ypat ygrp mdpat mdgrp
                             spat sgrp zpat zgrp } {
-
      variable datepats
      set regexp {^[[:space:]]*}
      set pat {}
@@ -1239,7 +1324,7 @@
      lappend datepats $regexp $pat
      return
  }
-
+
  # InitDatePats --
  #
  #       Internal rocedure that initializes the set of date patterns  
allowed in
@@ -1255,29 +1340,26 @@
  # Side effects:

  proc rfc2822::InitDatePats { permissible } {
-
      # Produce formats for the observed variants of ISO2822 dates.   
Permissible
      # variants come first in the list; impermissible ones come later.
-
+
      # The month and day may be "%b %d" or "%d %b"
-
-    foreach mdpat {{[[:alpha:]]+[[:space:]]+\d\d?}
+
+    foreach mdpat {{[[:alpha:]]+[[:space:]]+\d\d?}
          {\d\d?[[:space:]]+[[:alpha:]]+}} \
          mdgrp {{%b %d} {%d %b}} \
          mdperm {0 1} {

-            # The year may be two digits, or four. Four digit year is done
+            # The year may be two digits, or four. Four digit year is done
              # first.
-
+
              foreach ypat {{\d\d\d\d} {\d\d}} ygrp {%Y %y} {
-
                  # The seconds of the minute may be provided, or omitted.
-
+
                  foreach spat {{:\d\d} {}} sgrp {:%S {}} {
-
                      # The weekday may be provided or omitted. It is common  
but
                      # impermissible to omit the comma after the weekday  
name.
-
+
                      foreach wpat {
                          {(?:Mon|T(?:ue|hu)|Wed|Fri|S(?:at| 
un)),[[:space:]]+}
                          {(?:Mon|T(?:ue|hu)|Wed|Fri|S(?:at|un))[[:space:]]+}
@@ -1291,15 +1373,14 @@
                          0
                          1
                      } {
-
                          # Time zone is defined as +/- hhmm, or as a
                          # named time zone.  Other common but buggy
                          # formats are GMT+-hh:mm, a time zone name in
                          # quotation marks, and complete omission of
                          # the time zone.
-
+
                          foreach zpat {
-                            {[[:space:]]+(?:[-+]\d\d\d\d|[[:alpha:]]+)}
+                            {[[:space:]]+(?:[-+]\d\d\d\d|[[:alpha:]]+)}
                              {[[:space:]]+GMT[-+]\d\d:?\d\d}
                              {[[:space:]]+"[[:alpha:]]+"}
                              {}


More information about the Tkabber-dev mailing list