#!/usr/bin/env ruby
=begin
post_fsd_wiki.phantomjs
this script is used to validate and report statistics on the blacklist entries
it currently detects syntax errors, missing tags, unspecified tags, missing descriptions,
duplicate entries for a single package (partitioning them as identical or differing),
and 'neglected_entries' which are those with no tag, no description, and no replacement
it optionally creates a JSON file with the results
that can be the input to the post_fsd_wiki.phantomjs script
=end

# DEBUG begin
DEBUG = false
require 'byebug' if DEBUG
# package names to trace verbosely when DEBUG is enabled
DEBUG_FILTER_NAMES = ['vapoursynth-plugin-fluxsmooth'].freeze
# predicate: is this package name one of the debug-traced names?
def IS_DEBUG_FILTER_NAME(name)
  DEBUG_FILTER_NAMES.include? name
end
# DEBUG end

require 'json'
require 'set'
## configuration ##

# entry syntax => original-package:[libre-replacement]:[ref]:[id]:short-description
BLACKLIST_FILES = ['blacklist.txt'].freeze
# BLACKLIST_FILES = [ 'blacklist-testdata.txt' ]

# validates the overall 5-field, colon-separated structure of one blacklist line
VALID_ENTRIES_REGEX  = /^[^:\[\]#]*:[^:\[\]]*:(sv|debian|parabola|fsf|fedora)?:[^:\[\]]*:\w*([^\d:]+:.*|\[[^:]+:.*|[^:]*)$/
# captures a leading "[tag]" prefix as \1 and the remainder of the description as \2
BLACKLIST_TAGS_REGEX = /^\[([^\]]*)\] *(.*)/

# hash keys of one parsed entry
RAW_ENTRY_KEY      = :raw_blacklist_entry
PACKAGE_NAME_KEY   = :original_package  # per blacklist SYNTAX
REPLACEMENT_KEY    = :libre_replacement # per blacklist SYNTAX
REFERENCE_KEY      = :ref               # per blacklist SYNTAX
ENTRY_ID_KEY       = :id                # per blacklist SYNTAX
DESCRIPTION_KEY    = :short_description # per blacklist SYNTAX
BLACKLIST_TAGS_KEY = :blacklist_tags

# the set of tags which entries are expected to use
NONFREE_TAG         = 'nonfree'
SEMIFREE_TAG        = 'semifree'
USES_NONFREE_TAG    = 'uses-nonfree'
BRANDING_TAG        = 'branding'
TECHNICAL_TAG       = 'technical'
HAS_REPLACEMENT_TAG = 'FIXME:package'
NEEDS_DESC_TAG      = 'FIXME:description'
ACCEPTABLE_TAGS     = [NONFREE_TAG, SEMIFREE_TAG, USES_NONFREE_TAG, BRANDING_TAG,
                       TECHNICAL_TAG, HAS_REPLACEMENT_TAG, NEEDS_DESC_TAG].freeze

# report options
DO_PRINT_STATS      = true
DO_PRINT_INCOMPLETE = true
DO_PRINT_DUPLICATE  = false
REPORT_SEPARATOR    = "------------------------------------------------------------\n"
OUTPUT_JSON_FILE    = 'blacklist-data.json'

## accumulators ##
entries_invalid             = []
entries                     = []
entry_freqs                 = {}
entries_no_desc             = []
entries_no_tags             = []
entries_unspecified_tags    = []
unspecified_tags            = Set[]
duplicate_names             = nil # deferred
duplicate_identical_entries = {}
duplicate_differing_entries = {}
## parse data ##

BLACKLIST_FILES.each do |blacklist_filename|
  print "\nDEBUG: parsing #{blacklist_filename}\n" if DEBUG

  File.readlines(blacklist_filename).each do |line|
    # DEBUG_FILTER_NAMES.each { | ea | debugger if line.start_with? ea }

    # skip blank lines and comment lines
    next if line.strip.empty? || line.strip.start_with?('#')

    # collect syntactically-invalid lines separately and move on
    unless line.match VALID_ENTRIES_REGEX
      entries_invalid << line
      next
    end

    entries << (entry = {})
    tokens = line.split(':')
    if DEBUG
      tokens.each_with_index { |token, i| print "DEBUG: tokens[#{i}]=#{token}\n" if IS_DEBUG_FILTER_NAME tokens[0] }
    end

    # first four fields are fixed; everything after the fourth ':' is the description
    # (re-joined with ':' because descriptions may themselves contain colons)
    entry[RAW_ENTRY_KEY     ] = line
    entry[PACKAGE_NAME_KEY  ] = tokens.shift.gsub("\t", '').strip
    entry[REPLACEMENT_KEY   ] = tokens.shift.gsub("\t", '').strip
    entry[REFERENCE_KEY     ] = tokens.shift.gsub("\t", '').strip
    entry[ENTRY_ID_KEY      ] = tokens.shift.gsub("\t", '').strip
    entry[DESCRIPTION_KEY   ] = tokens.join(':').gsub("\t", '').strip
    entry[BLACKLIST_TAGS_KEY] = []

    # peel leading "[tag]" prefixes off the description into the tags list
    while entry[DESCRIPTION_KEY].start_with?('[') && entry[DESCRIPTION_KEY].include?(']')
      if DEBUG
        print "\n  parsing tag for: #{entry[PACKAGE_NAME_KEY]}\n"
        print "desc IN=#{entry[DESCRIPTION_KEY]}\n"
      end
      # debugger if DEBUG && (DEBUG_FILTER_NAMES.include? entry[PACKAGE_NAME_KEY])

      entry[BLACKLIST_TAGS_KEY] << entry[DESCRIPTION_KEY].gsub(BLACKLIST_TAGS_REGEX, '\1')
      entry[DESCRIPTION_KEY   ]  = entry[DESCRIPTION_KEY].gsub(BLACKLIST_TAGS_REGEX, '\2')

      if DEBUG
        print "desc OUT=#{entry[DESCRIPTION_KEY]}\n"
        print "tags=#{entry[BLACKLIST_TAGS_KEY]}\n"
        sleep 0.2
      end
    end

    if DEBUG
      print "\nno tag for: #{ entry[PACKAGE_NAME_KEY]}\n" if entry[BLACKLIST_TAGS_KEY].empty?
      print "\nno desc for: #{entry[PACKAGE_NAME_KEY]}\n"  if entry[DESCRIPTION_KEY   ].empty?
    end
  end
end
## process data ##

entries.each do |entry|
  if DEBUG && (IS_DEBUG_FILTER_NAME entry[PACKAGE_NAME_KEY])
    print "\n"
    entry.each_pair { |k, v| print "DEBUG: #{k}: #{v}\n" }
  end

  entry_name             = entry[PACKAGE_NAME_KEY]
  entry_desc             = entry[DESCRIPTION_KEY]
  entry_tags             = entry[BLACKLIST_TAGS_KEY]
  entry_unspecified_tags = entry_tags - ACCEPTABLE_TAGS

  # tally occurrences per package name (duplicates resolved below)
  entry_freqs[entry_name] = entry_freqs.fetch(entry_name, 0) + 1

  entries_no_desc          << entry if     entry_desc.empty?
  entries_no_tags          << entry if     entry_tags.empty?
  entries_unspecified_tags << entry unless entry_unspecified_tags.empty?
  unspecified_tags.merge entry_unspecified_tags
end

duplicate_names    = entry_freqs.keys.select { |name| entry_freqs[name] > 1 }
incomplete_entries = entries_no_desc + entries_no_tags
unreplaced_entries = entries.select { |entry| entry[REPLACEMENT_KEY].empty? }
# neglected == (lacking description OR tags) AND lacking a replacement,
# computed as a set intersection of package names
neglected_entries  = incomplete_entries.map { |entry| entry[PACKAGE_NAME_KEY] }.to_set &
                     unreplaced_entries.map { |entry| entry[PACKAGE_NAME_KEY] }.to_set

duplicate_names.each do |duplicate_name|
  # next unless IS_DEBUG_FILTER_NAME duplicate_name ; p "duplicate_name=#{duplicate_name}" # DEBUG

  duplicate_entries = entries.select { |entry| entry[PACKAGE_NAME_KEY] == duplicate_name }
                             .map    { |entry| entry[RAW_ENTRY_KEY] }
  unique_entries    = duplicate_entries.uniq

  # identical duplicates: the same raw line appears more than once
  unique_entries.each do |uniq_value|
    n_identical_entries = duplicate_entries.count { |dup_entry| dup_entry == uniq_value }
    if n_identical_entries > 1
      duplicate_identical_entries[duplicate_name] = uniq_value + " (#{n_identical_entries} identical)"
    end
  end

  # differing duplicates: one package name with more than one distinct raw line
  duplicate_differing_entries[duplicate_name] = unique_entries if unique_entries.size > 1
end
## report ##

print REPORT_SEPARATOR
print "entries found: #{ (entries + entries_invalid ).size }\n" if DO_PRINT_STATS
print "entries valid: #{ (entries ).size }\n" if DO_PRINT_STATS
print "entries_invalid: #{ (entries_invalid ).size }\n" if DO_PRINT_STATS
print "entries lacking tags: #{ (entries_no_tags ).size }\n" if DO_PRINT_STATS
print "entries lacking description: #{(entries_no_desc ).size }\n" if DO_PRINT_STATS
print "unspecified tags: #{ (unspecified_tags ).size }\n" if DO_PRINT_STATS
print "neglected entries: #{ (neglected_entries ).size }\n" if DO_PRINT_STATS
print "duplicate_names: #{ (duplicate_names ).size }\n" if DO_PRINT_STATS
print "  identical entries: #{ (duplicate_identical_entries).size }\n" if DO_PRINT_STATS
print "  differing entries: #{ (duplicate_differing_entries).keys.size}\n" if DO_PRINT_STATS

# itemized listings of each category of problem entry
if DO_PRINT_INCOMPLETE
  { 'invalid entries'               => entries_invalid          ,
    'entries lacking description'   => entries_no_desc          ,
    'entries lacking tags'          => entries_no_tags          ,
    'entries with unspecified tags' => entries_unspecified_tags ,
    'unspecified tags'              => unspecified_tags         }.each_pair do |label, data|
    print REPORT_SEPARATOR + "#{label}:\n" unless data.empty?
    # entry hashes print their raw blacklist line; bare strings print as-is
    data.each { |entry| print "  #{((entry.is_a? Hash) ? entry[RAW_ENTRY_KEY] : entry).strip}\n" }
  end
end

if DO_PRINT_INCOMPLETE && !neglected_entries.empty?
  print REPORT_SEPARATOR + "neglected entries:\n"
  neglected_entries.each { |entry_name| print "  #{entry_name}\n" }
end

if DO_PRINT_DUPLICATE && !duplicate_names.empty?
  print REPORT_SEPARATOR + "duplicate entries:\n"
  duplicate_names.each do |duplicate_name|
    identical_entry   = duplicate_identical_entries[duplicate_name]
    differing_entries = duplicate_differing_entries[duplicate_name]

    print "\n  #{duplicate_name}:\n"
    unless identical_entry.nil?
      print "    identical entries:\n"
      print "      #{identical_entry}\n"
    end
    unless differing_entries.nil?
      print "    differing entries:\n"
      differing_entries.each { |entry| print "      #{entry}\n" }
    end
  end
end
## sanity check ##

# refuse to emit JSON when any validation errors were detected
should_quit = !(entries_invalid.empty? && unspecified_tags.empty? && duplicate_names.empty?)
if should_quit
  print "errors were found - JSON will not be generated\n"
  exit 1
end

## generate JSON ##

IO.write OUTPUT_JSON_FILE, entries.to_json
print "\nwrote: #{OUTPUT_JSON_FILE}\n\ndone\n"
|