Register now and start sharing your code snippets.
-->

How to parse CSV data with Ruby

Ruby posted 2 months ago by christian

Parsing with plain Ruby

   1  filename = 'data.csv'
   2  file = File.new(filename, 'r')
   3  
   4  file.each_line('\n') do |row|
   5    columns = row.split(",")
   6    
   7    break if file.lineno > 10
   8  end

This approach doesn’t support quoted fields: a comma inside a quoted value is still treated as a column separator.

Parsing with the CSV library

   1  require 'csv'
   2  
   3  CSV.open('data.csv', 'r', ';') do |row|
   4    p row
   5  end
   6  

Parsing with the FasterCSV library

   1  require 'rubygems'
   2  require 'faster_csv'
   3  
   4  FasterCSV.foreach("data.csv", :quote_char => '"', :col_sep =>';', :row_sep =>:auto) do |row|
   5    puts row[0]
   6    break
   7  end

Tagged csv, parse, ruby, fastercsv

Perl script that can be used to calculate min, max, mean, mode, median and standard deviation for a set of log records

Perl posted 3 months ago by christian

The best thing about this script is that it’s easy to customize. Right now it expects semicolon-delimited input (timestamp;page;runtime) and writes a semicolon-delimited report that uses a decimal comma.

   1  use strict;
   2  use warnings;
   3  
   4  # Reads semicolon-delimited log records ("timestamp;page;runtime") from STDIN
   5  # (or from files named on the command line) and writes per-page runtime
   6  # statistics to report.csv in the current directory.
   7  
   8  # Import stdev, average, mean and other statistical functions
   9  # A copy of http://search.cpan.org/~brianl/Statistics-Lite-3.2/Lite.pm
  10  # The explicit "./" is needed since Perl 5.26, where "." is no longer in @INC.
  11  do('./stats.pl');
  12  die("Could not compile stats.pl: $@") if $@;
  13  
  14  # do() does not die when the file is missing, so verify up front that every
  15  # function used for the report was actually loaded.
  16  for my $function (qw(min max mean mode median stddev))
  17  {
  18    main->can($function) or die("stats.pl did not provide $function().\n");
  19  }
  20  
  21  my %page_runtimes;
  22  my $delimiter = ';';
  23  my @report_columns = ("page", "samples", "min", "max", "mean", "mode", "median", "stddev");
  24  my ($first_timestamp, $last_timestamp);
  25  
  26  # ==========================================
  27  # Parse log file
  28  # ==========================================
  29  # Read one line at a time; foreach (<>) would load the whole log into memory.
  30  while (my $line = <>) {
  31    # remove the newline from $line, otherwise the report will be corrupted.
  32    chomp($line);
  33  
  34    my ($timestamp, $page_name, $page_runtime) = split(/\Q$delimiter\E/, $line);
  35  
  36    # Skip blank or truncated lines instead of recording undefined values.
  37    if(!defined($page_runtime))
  38    {
  39      warn("Skipping malformed line $. ('$line')\n");
  40      next;
  41    }
  42  
  43    if(!defined($first_timestamp))
  44    {
  45      $first_timestamp = $timestamp;
  46    }
  47  
  48    # print what we find
  49    # exists() replaces defined(@array), which is a fatal error since Perl 5.22.
  50    if(!exists($page_runtimes{$page_name}))
  51    {
  52      print "Found page '$page_name'\n";
  53    }
  54  
  55    # add page runtimes to one hash
  56    push(@{$page_runtimes{$page_name}}, $page_runtime);
  57  
  58    $last_timestamp = $timestamp;
  59  }
  60  
  61  # ==========================================
  62  # Calculate and print page statistics
  63  # ==========================================
  64  open(my $page_report, '>', 'report.csv') or die("Could not open report.csv: $!");
  65  
  66  # With no usable input the timestamps are undefined; print them as empty.
  67  $first_timestamp = '' if !defined($first_timestamp);
  68  $last_timestamp  = '' if !defined($last_timestamp);
  69  
  70  print {$page_report} "First sample\n".$first_timestamp."\nLast sample\n".$last_timestamp."\n\n";
  71  print {$page_report} join($delimiter, @report_columns)."\n";
  72  
  73  # Sort the pages so the report order is the same on every run.
  74  for my $page_name (sort keys %page_runtimes)
  75  {
  76    my @runtimes = @{$page_runtimes{$page_name}};
  77  
  78    my $samples = @runtimes;
  79    my $min     = min(@runtimes);
  80    my $max     = max(@runtimes);
  81    my $mean    = mean(@runtimes);
  82    my $mode    = mode(@runtimes);
  83    my $median  = median(@runtimes);
  84    my $stddev  = stddev(@runtimes);
  85  
  86    my @statistics = ($samples, $min, $max, $mean, $mode, $median, $stddev);
  87  
  88    # Use comma instead of decimal
  89    # Only the numbers are converted: running the substitution over the whole
  90    # line would also mangle dots in the page name (e.g. "index.html").
  91    s/\./,/g for @statistics;
  92  
  93    print {$page_report} join($delimiter, $page_name, @statistics)."\n";
  94  }
  95  
  96  # Buffered write errors only surface at close, so check it.
  97  close($page_report) or die("Could not finish writing report.csv: $!");

To use it simply pipe some data into it like this:

   1  grep -F '2008-31-12' silly-data.log | perl analyze.pl

Tagged csv, perl, min, max, mean, log, parser