urlinfo.pl

From Anna, 5 Years ago, written in Plain Text, viewed 893 times.
URL https://anna.fyi/view/2f8a0de3 Embed
Download Paste or View Raw
  1. use strict;
  2. use warnings;
  3. use 5.014;
  4. use utf8;
  5. use Encode;
  6. use Irssi;
  7. use POSIX ();
  8.  
  9. our $VERSION = "1.6";
  10. our %IRSSI = (
  11.     authors     => 'David Leadbeater',
  12.     contact     => 'dgl@dgl.cx',
  13.     name        => 'urlinfo',
  14.     description => 'Print short summaries about URLs from known services that are mentioned on IRC. (Including YouTube, etc.)',
  15.     license     => 'WTFPL <http://dgl.cx/licence>',
  16.     url         => 'http://dgl.cx/irssi',
  17. );
  18.  
  19. # This is needed so it can still run standalone for testing.
  20. BEGIN {
  21.   Irssi->import(20140628) if __PACKAGE__ =~ /Irssi/; # This needs Irssi 0.8.17
  22. }
  23.  
  24. BEGIN {
  25.   eval {
  26.     require HTML::TreeBuilder;
  27.     require URI;
  28.   } or do {
  29.     print "\x{3}8You need to install HTML::TreeBuilder and URI";
  30.     if (-f "/etc/debian_version") {
  31.       print "Try running: \x{3}9sudo apt-get install libhtml-treebuilder-perl libwww-perl";
  32.     }
  33.     die $@;
  34.   }
  35. }
  36.  
  37. # ------- Settings
  38. # /SET urlinfo_title_unknown ON|OFF
  39. #   Show title of all unknown sites. (There is a timeout to prevent against
  40. #   obvious resource exhaustion attacks, but remember this script has no
  41. #   warranty.)
  42. #
  43. # /SET urlinfo_timeout 10
  44. #   How many seconds after which to give up trying to fetch a URL
  45. #
  46. # /SET urlinfo_ignore_domains example\.org example\.com
  47. #   Space separated list of regular expressions of domains to ignore
  48. #
  49. # /SET urlinfo_ignore_targets freenode #something efnet/#example
  50. #   Space separated list of targets to ignore.
  51. #
  52. # /SET urlinfo_send_channels freenode #something efnet/#example
  53. #   Space separated list of targets to post in the channel.
  54. #
  55. # /SET urlinfo_custom_domains my\.domain/thing irssi\.org=description
  56. #   A limited way of configuring custom domains, if you need something more
  57. #   complex edit SITES below.
  58. #   Format: domain[/path[=from]]
  59.  
  60. # ------- Sites configuration
  61.  
  62. # This script aims to be data driven, this hash has "site_name => {site details}"
  63. # site details is a hash reference which can contain:
  64. #   cleanup: A regexp of text to remove from the resulting string
  65. #   domain: A string (or regexp, with qr//) of the domain to match (www. is
  66. #     removed automatically).
  67. #   from: Where to read the info "title", "description" (meta description) or
  68. #     a regexp to match the content (default "title")
  69. #   example: Example of this URL
  70. #   expected: What the example should return (see end for testing)
  71. #   items: An array ref of additional hashes to allow multiple of these values
  72. #     e.g.: items => [ { domain => "example.com" } ]
  73. #   path: Path component (string or regexp)
  74. #
  75. my %SITES = (
  76.   vimeo => {
  77.     cleanup => qr/\s*on Vimeo$/,
  78.     domain => "vimeo.com",
  79.     path => qr{/\d+},
  80.     example => "http://vimeo.com/80871338",
  81.     expected => "Journey Part 1",
  82.   },
  83.   youtube => {
  84.     cleanup => qr/\s*-\s*YouTube$/,
  85.     items => [
  86.       {
  87.         domain => "youtu.be",
  88.         example => "http://youtu.be/ghGoI7xVtSI",
  89.         expected => "Rick Astley - Never Gonna Give You Up (Live 1987)",
  90.       },
  91.       {
  92.         domain => "youtube.com",
  93.         path => "/watch",
  94.         example => "https://www.youtube.com/watch?v=ghGoI7xVtSI",
  95.         expected => "Rick Astley - Never Gonna Give You Up (Live 1987)",
  96.       },
  97.     ],
  98.   },
  99.   metacpan => {
  100.     cleanup => qr/\s*-\s*metacpan\.org$/,
  101.     domain => "metacpan.org",
  102.     path => qr{^/pod/},
  103.     example => "https://metacpan.org/pod/release/DOY/Reply-0.34/lib/Reply.pm",
  104.     from => "description",
  105.     expected => "read, eval, print, loop, yay!",
  106.   },
  107.   pypi => {
  108.     domain => "pypi.python.org",
  109.     path => qr{^/pypi/},
  110.     from => "description",
  111.     example => "https://pypi.python.org/pypi/stanford-corenlp-python/3.3.6-0",
  112.     expected => "A Stanford Core NLP wrapper (wordseer fork)",
  113.   },
  114.   gist => {
  115.     cleanup => qr/\s*-\s*Gist is .*$/,
  116.     domain => "gist.github.com",
  117.     from => ["og:title", "description"],
  118.     example => "https://gist.github.com/dgl/792206",
  119.     expected => "An install script that installs a development version of perl (from ".
  120.                 "git) and keeps a particular set of modules installed. Sort of ".
  121.                 "perlbrew for blead, but not quite.",
  122.   },
  123.   github => {
  124.     domain => "github.com",
  125.     items => [
  126.       { # issue, commit or pull
  127.         cleanup => qr/\s*·.*$/,
  128.         path => qr{^/[^/]+/[^/]+/(?:issues|commit|pull)/[a-f0-9]},
  129.         example => "https://github.com/irssi/irssi/commit/669add",
  130.         expected => "FS#155 hilight -tag",
  131.       },
  132.       { # user or project
  133.         from => "og:description",
  134.         path => qr{^/[^/]+(?:/[^/]+)?$},
  135.         example => "https://github.com/irssi/irssi",
  136.         expected => "irssi - The client of the future",
  137.       },
  138.     ],
  139.   },
  140. );
  141.  
  142. # ------- Site handling
  143.  
  144. sub expand {
  145.   my @expanded_sites;
  146.   for my $site(keys %SITES) {
  147.     expand_site(\@expanded_sites, $site, $SITES{$site}, {});
  148.   }
  149.   return @expanded_sites;
  150. }
  151.  
  152. # This essentially implements inheritance (via the "items" key), to reduce
  153. # duplication.
  154. sub expand_site {
  155.   my($expanded_sites, $site, $site_data, $current) = @_;
  156.   delete $current->{items};
  157.   my $s = {
  158.     name => $site,
  159.     from => "title",
  160.     %$current,
  161.     %$site_data,
  162.   };
  163.   if (exists $s->{items}) {
  164.     expand_site($expanded_sites, $site, $_, $s) for @{$s->{items}};
  165.   } else {
  166.     push @$expanded_sites, $s;
  167.   }
  168. }
  169.  
  170. sub _matcher {
  171.   my($site, $item) = @_;
  172.   return 1 unless defined $site;
  173.   return 1 if ref $site eq 'ARRAY' && grep _matcher($_, $item), @$site;
  174.   return 1 if ref $site && $site->isa("Regexp") && $item =~ $site;
  175.   return $site eq $item;
  176. }
  177.  
  178. sub get_site {
  179.   my($sites, $url) = @_;
  180.  
  181.   my $uri = URI->new($url);
  182.   $uri = URI->new("http://$url") unless $uri and $uri->scheme;
  183.   return unless $uri and $uri->can("host") and $uri->host and $uri->scheme =~ /^https?$/;
  184.  
  185.   for my $site(@$sites) {
  186.     my $match = 1;
  187.     $match &&= _matcher($site->{domain}, $uri->host =~ s/^www\.//ri);
  188.     $match &&= _matcher($site->{$_}, $uri->$_) for qw(scheme host path query fragment);
  189.     return $site, $uri if $match;
  190.   }
  191.  
  192.   if (Irssi::settings_get_bool("urlinfo_title_unknown")) {
  193.     return { name => "unknown", from => "title" }, $uri;
  194.   }
  195.  
  196.   return;
  197. }
  198.  
  199. my %from = (
  200.   title => sub {
  201.     $_[0]->look_down(_tag => 'title')->as_trimmed_text;
  202.   },
  203.   description => sub {
  204.     my $el = $_[0]->look_down(_tag => 'meta', name => 'description');
  205.     $el && $el->attr('content');
  206.   },
  207.   'og:description' => sub {
  208.     my $el = $_[0]->look_down(_tag => 'meta', property => 'og:description');
  209.     $el && $el->attr('content');
  210.   },
  211.   'og:title' => sub {
  212.     my $el = $_[0]->look_down(_tag => 'meta', property => 'og:title');
  213.     $el && $el->attr('content');
  214.   },
  215. );
  216.  
  217. sub get_info {
  218.   my($site, $uri) = @_;
  219.   my $from = $site->{from};
  220.   my $tree = HTML::TreeBuilder->new_from_url($uri);
  221.   my $info;
  222.   if (!ref $from || ref $from eq 'ARRAY') {
  223.     $info = join ": ", grep defined, map $from{$_}->($tree), ref $from ? @$from : $from;
  224.   } else {
  225.     $info = join "", $tree->as_html =~ $from;
  226.   }
  227.   $info =~ s/$site->{cleanup}// if $site->{cleanup};
  228.   $info =~ s/([\x00-\x19])/sprintf "\\x%x", ord $1/ger;
  229. }
  230.  
  231. # ------- IRC message handling
  232.  
  233. # John Gruber's URL regexp (nicely handles people putting URLs in parens, etc)
  234. my $URL_RE = qr{((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))};
  235.  
  236. my $pipe_in_progress;
  237. my @sites;
  238. my $timeout = 10;
  239.  
  240. sub msg {
  241.   my($server, $text, $nick, undef, $target) = @_;
  242.   # TODO: Add a queue / multiple pipe support?
  243.   return if $pipe_in_progress;
  244.  
  245.   my $msg_time = time;
  246.   my $tag = $server->{tag};
  247.   $target = $target || $nick;
  248.   $text = Irssi::strip_codes($text);
  249.  
  250.   if (my($url) = $text =~ $URL_RE) {
  251.     my($site, $uri) = get_site(\@sites, $url);
  252.     return unless $site;
  253.     return if ignored($uri, $server, $target);
  254.  
  255.     fork_wrapper(sub { # Child
  256.       my($fh) = @_;
  257.       syswrite $fh, "  " . encode_utf8(get_info($site, $uri));
  258.     },
  259.     sub { # Parent
  260.       my $in = decode_utf8($_[0]);
  261.       if ($in =~ s/^- //) {
  262.         print "\x{3}4urlinfo error:\x{3} $in";
  263.         return;
  264.       }
  265.       $in =~ s/^  //;
  266.       return unless $in;
  267.  
  268.       # Avoid reusing server just in case it is no longer valid
  269.       my $server = Irssi::server_find_tag($tag);
  270.       my $win = find_window($server, $target);
  271.  
  272.       my $view = $win->view;
  273.       my $line = $view->get_lines;
  274.       while ($line && ($line = $line->next)) {
  275.         if ($line->{info}->{time} >= $msg_time) {
  276.           if ($line->get_text(0) =~ /\Q$url/) {
  277.             last;
  278.           }
  279.         }
  280.       }
  281.  
  282.       my $timestamp = POSIX::strftime(
  283.         Irssi::settings_get_str("timestamp_format"), localtime $msg_time);
  284.       # I'm sure I shouldn't have to care about colours here...
  285.       my $pad = length Irssi::strip_codes($timestamp);
  286.  
  287.       if (not(send2channel($server,$target,$url,$in))) {
  288.         my $text = $win->format_get_text(__PACKAGE__, $server, $target,
  289.           "urlinfo", " " x $pad, $in);
  290.         $win->print_after($line, MSGLEVEL_NO_ACT|MSGLEVEL_CLIENTCRAP,
  291.           $text, $msg_time);
  292.         $view->redraw;
  293.       }
  294.     });
  295.   }
  296. }
  297.  
  298. sub send2channel {
  299.   my ($server,$target,$url,$in) =@_;
  300.   my @cl= split(" ",Irssi::settings_get_str('urlinfo_send_channels'));
  301.   my $s=0;
  302.  
  303.   foreach ( @cl) {
  304.     if ( $_ eq $target || $_ eq $server->{tag}."/".$target) {
  305.       $s=1;
  306.       $server->command("msg $target urlinfo: $in");
  307.       last;
  308.     }
  309.   }
  310.  
  311.   return $s;
  312. }
  313.  
  314. sub ignored {
  315.   my($uri, $server, $target) = @_;
  316.   my @ignored_domains = split / /, Irssi::settings_get_str('urlinfo_ignore_domains');
  317.   my $domain = $uri->host =~ s/^www\.//r;
  318.   return 1 if grep $domain =~ /^$_$/, @ignored_domains;
  319.  
  320.   my $chans = $server->isupport("chantypes") || '#&';
  321.   my $chan_match = qr/^[$chans]/;
  322.  
  323.   for my $ignored_target (split / /, Irssi::settings_get_str('urlinfo_ignore_targets')) {
  324.     my($mtag, $mtarget) = split m{/}, $ignored_target;
  325.     if ($mtag =~ $chan_match) {
  326.       $mtarget = $mtag;
  327.       $mtag = "*";
  328.     }
  329.     return 1 if _match($mtag, $server->{tag}) &&
  330.       (!$mtarget || _match($mtarget, $target));
  331.   }
  332.  
  333.   return 0;
  334. }
  335.  
  336. sub _match {
  337.   my($pattern, $name) = @_;
  338.   $pattern =~ s/\*/.*/g;
  339.   $name =~ /^$pattern$/i;
  340. }
  341.  
  342. sub find_window {
  343.   my($server, $target) = @_;
  344.   if (my $witem = $server->window_item_find($target)) {
  345.     return $witem->window;
  346.   } else {
  347.     # Maybe they have a msgs window?
  348.     my $win = Irssi::window_find_name("(msgs)");
  349.     # Ultimate fallback
  350.     $win = Irssi::window_find_refnum(1) unless $win;
  351.     return $win;
  352.   }
  353. }
  354.  
  355. # Based on scriptassist.
  356. sub fork_wrapper {
  357.   my($child, $parent) = @_;
  358.  
  359.   pipe(my $rfh, my $wfh);
  360.  
  361.   my $pid = fork;
  362.   $pipe_in_progress = 1;
  363.  
  364.   return unless defined $pid;
  365.  
  366.   if ($pid) {
  367.     close $wfh;
  368.     Irssi::pidwait_add($pid);
  369.     my $pipetag;
  370.     my @args = ($rfh, \$pipetag, $parent);
  371.     $pipetag = Irssi::input_add(fileno($rfh), Irssi::INPUT_READ, \&pipe_input, \@args);
  372.   } else {
  373.     eval {
  374.       local $SIG{ALRM} = sub { die "Timed out\n" };
  375.       alarm $timeout;
  376.       $child->($wfh);
  377.     };
  378.     alarm 0;
  379.     syswrite $wfh, encode_utf8("- $@") if $@;
  380.     POSIX::_exit(1);
  381.   }
  382. }
  383.  
  384. sub pipe_input {
  385.   my ($rfh, $pipetag, $parent) = @{$_[0]};
  386.   my $line = <$rfh>;
  387.   close($rfh);
  388.   Irssi::input_remove($$pipetag);
  389.   $pipe_in_progress = 0;
  390.   $parent->($line);
  391. }
  392.  
  393. sub setup_changed {
  394.   $timeout = Irssi::settings_get_int("urlinfo_timeout");
  395.  
  396.   @sites = expand();
  397.   for my $site (split / /, Irssi::settings_get_str("urlinfo_custom_domains")) {
  398.     next unless $site;
  399.  
  400.     my($re, $from) = split /=/, $site;
  401.     $from ||= "title";
  402.     my($domain, $path) = split m{/}, $re, 2;
  403.     expand_site(\@sites, "custom", {
  404.         domain => qr/^$domain$/,
  405.         path => defined $path ? qr/^\/$path/ : undef,
  406.         from => $from,
  407.     }, {});
  408.   }
  409. }
  410.  
  411. # ------- Initialization
  412.  
  413. if (caller) {
  414.   # Irssi specific initialization
  415.   require Irssi::TextUI;
  416.  
  417.   Irssi::settings_add_str($IRSSI{name}, "urlinfo_custom_domains", "");
  418.   Irssi::settings_add_str($IRSSI{name}, "urlinfo_ignore_domains", "");
  419.   Irssi::settings_add_str($IRSSI{name}, "urlinfo_ignore_targets", "");
  420.   Irssi::settings_add_str($IRSSI{name}, "urlinfo_send_channels", "");
  421.   Irssi::settings_add_int($IRSSI{name}, "urlinfo_timeout", $timeout);
  422.   Irssi::settings_add_bool($IRSSI{name}, "urlinfo_title_unknown", 0);
  423.  
  424.   Irssi::signal_add("message irc action" => \&msg);
  425.   Irssi::signal_add("message private" => \&msg);
  426.   Irssi::signal_add("message public" => \&msg);
  427.  
  428.   Irssi::signal_add_last("setup changed", \&setup_changed);
  429.   setup_changed();
  430.  
  431.   Irssi::theme_register([
  432.     'urlinfo' => '$0 %Kinfo:%n $1',
  433.   ]);
  434.  
  435. } else {
  436.   # Built in test. Run this script outside Irssi to use.
  437.   @sites = expand();
  438.   for my $site(@sites) {
  439.     next unless $site->{example};
  440.     my($found_site, $uri) = get_site(\@sites, $site->{example});
  441.     if ($found_site != $site) {
  442.       die "Got $found_site->{name}, expected $site->{name}";
  443.     }
  444.     say "Get $uri";
  445.     my $result = get_info($site, $uri);
  446.     say $result;
  447.     die "Got $result, expected $site->{expected}" unless $result eq $site->{expected};
  448.   }
  449.   say "OK";
  450. }
  451.  

Reply to "urlinfo.pl"

Here you can reply to the paste above