Pontifications

# Sort the array of arrays in ascending order of id (the id is the first element of each inner array).
sorted_array = id_time_url_title_content_tags_array.sort_by(&:first)
  • 2. Multi-line regular expressions are delimited by / and /x (extended mode, which ignores whitespace inside the pattern), e.g.:
# Pattern for spotting macOS / Mac mentions in free text.
# The trailing /x (extended mode) makes the regexp engine ignore the literal
# spaces, tabs and newlines used here for layout, so a real whitespace
# character in the input has to be written as \s.
# No /i flag is given, so matching is case-sensitive (all alternatives are
# lowercase).
osx_regexp= 
/
  (os\sx|mojave|catalina|macos|elcapitan|osx|mac\sos|
  	el\scapitan|sierra|yosemite|mavericks|
    macbook|imac|powermac|macpro|mac\spro|macintosh)
/x

How to use the script (listed under "Code" below)

# Work inside the per-month directory (here 201907); the scripts are invoked from its parent via ../
cd 201907
# Run the fetch script with the arguments 2019 and 7 -- presumably year and month; that script is not shown here.
../get-questions-for-1-month.rb 2019 7
# Run the filter script on the month's CSV. "2>" redirects STDERR to /tmp/foo.txt
# (presumably this is the script listed under "Code", whose debug logger writes to STDERR).
../print-desktop-en-us-osx-increasing-ids-time-url-title-content.rb  \
2019-07-firefox-desktop-all-locales.csv  2>/tmp/foo.txt

Code

#!/usr/bin/env ruby
require 'json'
require 'rubygems'
require 'awesome_print'
require 'json'
require 'time'
require 'date'
require 'csv'
require 'logger'
require 'nokogiri'

# Filters a questions CSV (the usage text calls it sumoquestions.csv) down to
# en-US Firefox questions that mention macOS / Mac hardware in their title,
# content or tags, and writes them, sorted by ascending question id, to
# "sorted-osx-desktop-en-us-<input file name>" in the current directory.
#
# Usage: <script> sumoquestions.csv
# Columns read from the input: locale, product, title, content, tags, id, created.
# Debug output goes to STDERR.
logger = Logger.new(STDERR)
logger.level = Logger::DEBUG

if ARGV.length < 1
  puts "usage: #{$0} [sumoquestions.csv]"
  exit
end

# Input CSV path. The output path gets its own constant further down instead
# of reassigning this one (reassignment triggers an "already initialized
# constant" warning).
FILENAME = ARGV[0]

# Pattern applied to the tags column. It uses "-" where the free-text pattern
# below uses \s. Under /x the layout whitespace and newlines are ignored, so
# no line-continuation backslash is needed inside the pattern.
osx_regexp_tags =
  /
    (?:os-x|mojave|catalina|macos|elcapitan|osx|mac-os|
      el-capitan|sierra|yosemite|mavericks)
  /x

# Pattern applied to the title and to the plain-text content.
# NOTE(review): there is no /i flag, so spellings such as "macOS", "Mac OS" or
# "Catalina" do not match in titles/content; confirm whether that is intended
# before adding /i, since it would change which questions are selected.
osx_regexp =
  /
    (os\sx|mojave|catalina|macos|elcapitan|osx|mac\sos|
      el\scapitan|sierra|yosemite|mavericks|
      macbook|imac|powermac|macpro|mac\spro|macintosh)
  /x

num_osx_questions = 0
id_time_url_title_content_tags_array = []
CSV.foreach(FILENAME, :headers => true) do |row|
  logger.debug row['tags']
  logger.debug row['title']
  next if row['locale'] != "en-US" || row['product'] != 'firefox'

  # Always extract the plain-text content: it is written to the output even
  # when the title alone already identified the question (previously the
  # content column was left blank for title matches).
  content = Nokogiri::HTML.fragment(row['content'].to_s).text
  logger.debug "CONTENT:#{content}"

  found_in_title = osx_regexp.match(row['title'])
  logger.debug "FOUND os x in title" if found_in_title

  # Only consult the content when the title did not already match.
  found_in_content = !found_in_title && osx_regexp.match(content)
  logger.debug "FOUND os x in content" if found_in_content

  # Keep the row if the title, the content or the tags mention macOS.
  next unless found_in_title || found_in_content || osx_regexp_tags.match(row['tags'])

  num_osx_questions += 1

  id_time_url_title_content_tags_array.push(
    [
      row['id'].to_i,
      # "created" is treated as epoch seconds and rendered in the local time
      # zone, e.g. 10/2/2019 20:34:35
      Time.at(row["created"].to_i).strftime("%-m/%-d/%Y %H:%M:%S"),
      "https://support.mozilla.org/questions/" + row['id'].to_s,
      row['title'],
      content[0..79], # first 80 characters of the plain-text content
      row["tags"]
    ]
  )
end
logger.debug "num_osx_questions:#{num_osx_questions}"

# Ascending sort by question id, the first element of each row array.
sorted_array = id_time_url_title_content_tags_array.sort_by(&:first)
headers = ['id', 'created', 'url', 'title', 'content', 'tags']

# File.basename keeps the output name valid when the input is given with a
# directory component; for a bare file name the result is unchanged.
OUTPUT_FILENAME = sprintf("sorted-osx-desktop-en-us-%s", File.basename(FILENAME))
CSV.open(OUTPUT_FILENAME, "w", write_headers: true, headers: headers) do |csv_object|
  sorted_array.each { |row_array| csv_object << row_array }
end

Leave a comment on GitHub