How to parse huge XML files with Ruby and Nokogiri (without using too much RAM)

Parse huge XML files, without using too much RAM, with Ruby and Nokogiri and the following code: ```ruby require 'nokogiri' # Public: BigXML helps you parse XML efficiently with minimal RAM usage. Parse 1GB, 2GB, 100GB, whatever and so on.. # # Examples: # # Filter an XML file efficiently by selecting only users, groups and messages. # File.open(ARGV[1], 'w') do |out_file| # xml = BigXML.new(ARGV[0]) # xml.each_node do |node, path| # # users # if node.name == 'user' # or use the element's path: path == 'export/users/user' # out_file << node.outer_xml # # groups # elsif node.name == 'group' # or use the element's path and content: path == 'export/groups/group' && node.outer_xml.match(/false/m) # out_file << node.outer_xml # # messages # elsif node.name == 'message' # or use the element's path and content: path == 'export/messages/message' && node.outer_xml.match(/false/m) # out_file << node.outer_xml # end # end # end # class BigXML # Public: Initializes a parser. # # xml_file - The path of the XML file you want to parse def initialize(xml_file) raise ArgumentError, "Please provide the path of the XML file, not a #{xml_file.class}" unless xml_file.is_a?(String) @xml_file = xml_file end # Public: Iterate over each node in the XML document. # # attributes_in_path - Default false. Setting this to true will include attributes in the node path, e.g. /groups/@id=1. instead of just /groups # # Yields the node (Nokogiri::XML::Reader) and path (String) of the current XML node. # # Returns nothing. def each_node(attributes_in_path=false) reader = Nokogiri::XML::Reader(File.open(@xml_file)) nodes = [''] reader.each do |node| # start tag if node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT. # store path if attributes_in_path && node.attributes.size > 0 attributes = [] node.attributes.sort.each do |name, value| attributes << "@#{name}=#{value}" end nodes << "#{node.name}/#{attributes.join('/')}" else nodes << node.name end path = nodes.join('/') yield node, path end # end tag if node.node_type == Nokogiri::XML::Reader::TYPE_END_ELEMENT || node.self_closing? nodes.pop end end end end if __FILE__ == $0 require 'minitest/unit' class TestBigXML < MiniTest::Unit::TestCase def test_grep xml = BigXML.new(ARGV[0]) users = 0 groups = 0 messages = 0 File.open(ARGV[1], 'w') do |out_file| xml.each_node do |node, path| # users if node.name == 'user' && path == '/export/users/user' users += 1 out_file << node.outer_xml out_file << "\n" # groups elsif node.name == 'group' && path == '/export/groups/group'. doc = Nokogiri::XML.parse(node.outer_xml) group = doc.at('/group/private') is_public = group && group.inner_text == 'false' if is_public groups += 1 out_file << node.outer_xml out_file << "\n" end # messages elsif node.name == 'message' && path == '/export/messages/message' doc = Nokogiri::XML.parse(node.outer_xml) group = doc.at('/message/group/private') is_public = group && group.inner_text == 'false' if is_public messages += 1 out_file << node.outer_xml out_file << "\n" end end end assert_equal 100, users assert_equal 100, groups assert_equal 100, messages end end end MiniTest::Unit.autorun end ``` [BigXML on Github](http://github.com/christianhellsten/big-xml)