CentOS HA neutron gre cluster deployment failed with bash -c "source /root/openrc ; neutron net-list --http-timeout=4 " 2>&1 > /dev/null returned 1 instead of one of [0]

Bug #1391213 reported by Andrey Sledzinskiy
10
This bug affects 2 people
Affects Status Importance Assigned to Milestone
Fuel for OpenStack
Invalid
High
Stanislav Makar

Bug Description

{

    "build_id": "2014-11-08_21-30-24",
    "ostf_sha": "9c6fadca272427bb933bc459e14bb1bad7f614aa",
    "build_number": "85",
    "auth_required": true,
    "api": "1.0",
    "nailgun_sha": "8330f6221e190db87fc5baa735fa719c85a2e02d",
    "production": "docker",
    "fuelmain_sha": "f536d11fb40fed78c26f4d7326c45b0c48f157d8",
    "astute_sha": "3c374c9f7bfbdbcd7ce2f716cd704e3044e6fb41",
    "feature_groups": [
        "mirantis"
    ],
    "release": "6.0",
    "release_versions": {
        "2014.2-6.0": {
            "VERSION": {
                "build_id": "2014-11-08_21-30-24",
                "ostf_sha": "9c6fadca272427bb933bc459e14bb1bad7f614aa",
                "build_number": "85",
                "api": "1.0",
                "nailgun_sha": "8330f6221e190db87fc5baa735fa719c85a2e02d",
                "production": "docker",
                "fuelmain_sha": "f536d11fb40fed78c26f4d7326c45b0c48f157d8",
                "astute_sha": "3c374c9f7bfbdbcd7ce2f716cd704e3044e6fb41",
                "feature_groups": [
                    "mirantis"
                ],
                "release": "6.0",
                "fuellib_sha": "c7b71bd1ee939b5a634715ac7b13c2936ad93a5e"
            }
        }
    },
    "fuellib_sha": "c7b71bd1ee939b5a634715ac7b13c2936ad93a5e"

}

Steps:
1. Create next cluster - CentOS, Ha, Neutron Gre, 3 controller, 2 compute, 1 cinder nodes
2. Deploy cluster

Deployment failed on first controller (node-1) with

2014-11-10 05:30:29 ERR

 (/Stage[main]/Keystone::Roles::Admin/Keystone_user[admin]) Could not evaluate: Execution of '/usr/bin/keystone --os-auth-url http://10.108.32.3:35357/v2.0/ token-get' returned 1: Authorization Failed: Request to http://10.108.32.3:35357/v2.0/tokens timed out (HTTP 408)

2014-11-10 05:07:11 ERR

 (/Stage[main]/Openstack::Network/Exec[waiting-for-neutron-api]/returns) change from notrun to 0 failed: bash -c "source /root/openrc ; neutron net-list --http-timeout=4 " 2>&1 > /dev/null returned 1 instead of one of [0]

2014-11-10 05:07:11 ERR

 /usr/bin/puppet:4

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/util/command_line.rb:91:in `execute'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/util/command_line.rb:137:in `run'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/application.rb:364:in `run'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/util.rb:478:in `exit_on_fail'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/application.rb:364:in `run'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/application.rb:470:in `plugin_hook'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/application.rb:364:in `run'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/application/apply.rb:146:in `run_command'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/application/apply.rb:218:in `main'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/application/apply.rb:268:in `apply_catalog'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/configurer.rb:192:in `run'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/configurer.rb:124:in `apply_catalog'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/util.rb:160:in `benchmark'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/1.8/benchmark.rb:308:in `realtime'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/util.rb:161:in `benchmark'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/configurer.rb:125:in `apply_catalog'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/resource/catalog.rb:163:in `apply'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/transaction/report.rb:108:in `as_logging_destination'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/util/log.rb:149:in `with_destination'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/resource/catalog.rb:164:in `apply'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/transaction.rb:108:in `evaluate'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/graph/relationship_graph.rb:118:in `traverse'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/transaction.rb:117:in `evaluate'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/util.rb:326:in `thinmark'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/1.8/benchmark.rb:308:in `realtime'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/util.rb:327:in `thinmark'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/transaction.rb:117:in `evaluate'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/transaction.rb:117:in `call'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/transaction.rb:187:in `eval_resource'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/transaction.rb:174:in `apply'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/transaction/resource_harness.rb:18:in `evaluate'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/transaction/resource_harness.rb:81:in `perform_changes'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/transaction/resource_harness.rb:81:in `each'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/transaction/resource_harness.rb:82:in `perform_changes'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/transaction/resource_harness.rb:130:in `sync_if_needed'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/transaction/resource_harness.rb:193:in `sync'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/type/exec.rb:120:in `sync'

2014-11-10 05:07:11 ERR

 /usr/lib/ruby/site_ruby/1.8/puppet/util/errors.rb:97:in `fail'

2014-11-10 05:07:11 ERR

 bash -c "source /root/openrc ; neutron net-list --http-timeout=4 " 2>&1 > /dev/null returned 1 instead of one of [0]

Logs are attached

Revision history for this message
Andrey Sledzinskiy (asledzinskiy) wrote :
Revision history for this message
Dennis Dmitriev (ddmitriev) wrote :

Reproduced on Jenkins job: http://jenkins-product.srt.mirantis.net:8080/job/6.0.ubuntu.promo_bvt/35/

[root@nailgun ~]# fuel node
id | status | name | cluster | ip | mac | roles | pending_roles | online | group_id
---|-------------|------------------------------|---------|-------------|-------------------|----------------------|---------------|--------|---------
4 | provisioned | slave-02_controller_ceph-osd | 1 | 10.108.60.4 | 64:37:68:fe:8e:da | ceph-osd, controller | | True | 1
1 | error | slave-01_controller_ceph-osd | 1 | 10.108.60.3 | 82:0b:57:1a:f2:45 | ceph-osd, controller | | True | 1
6 | provisioned | slave-04_compute_ceph-osd | 1 | 10.108.60.6 | 64:27:e4:de:6a:c8 | ceph-osd, compute | | True | 1
5 | provisioned | slave-05_compute_ceph-osd | 1 | 10.108.60.7 | 64:6b:96:77:86:9e | ceph-osd, compute | | True | 1
3 | provisioned | slave-03_controller_ceph-osd | 1 | 10.108.60.5 | 64:bb:97:cd:22:cb | ceph-osd, controller | | True | 1
2 | provisioned | slave-06_ceph-osd | 1 | 10.108.60.8 | 64:b4:78:43:da:58 | ceph-osd | | True | 1

root@node-1:~# source /root/openrc ; neutron net-list --http-timeout=4
Gateway Timeout (HTTP 504)

Revision history for this message
Dennis Dmitriev (ddmitriev) wrote :
Revision history for this message
Dennis Dmitriev (ddmitriev) wrote :

From Neutron server log:

===============
2014-11-13 17:43:30.503 25145 CRITICAL neutron [-] OperationalError: (OperationalError) no such table: ml2_vlan_allocations u'SELECT ml2_vlan_allocations.physical_network AS ml2_vlan_allocations_physical_network, ml2_vlan_allocations.vlan_id AS ml2_vlan_allocations_vlan_id, ml2_vlan_allocations.allocated AS ml2_vlan_allocations_allocated \nFROM ml2_vlan_allocations' ()

Changed in fuel:
status: New → Confirmed
Revision history for this message
Stanislav Makar (smakar) wrote :
Changed in fuel:
assignee: Fuel Library Team (fuel-library) → Stanislav Makar (smakar)
Stanislav Makar (smakar)
Changed in fuel:
status: Confirmed → In Progress
Changed in fuel:
assignee: Stanislav Makar (smakar) → Bogdan Dobrelya (bogdando)
Revision history for this message
Bogdan Dobrelya (bogdando) wrote :

Here is log analysis from https://bugs.launchpad.net/fuel/+bug/1391213/comments/3

http://paste.openstack.org/show/134709/
1.Wait for synced state and wait for backend reported OK, but there are no '[Note] /usr/sbin/mysqld: ready for connections' in mysql log (Note: is it really an issue with mysql accessibility or not?)

2. Neutron server service started before its DB schema prepared (dup of https://bugs.launchpad.net/fuel/+bug/1393772)

3. Gateway timeout errors from keystone due to we do not wait for keystone haproxy backend for neutron case (and/or mysql is not ready for connections yet?) Note: looks like a new bug, we should always wait for keystone haproxy backend.

4. Finished puppet apply and started a new one

5. Only now mysql is ready for connections, but deployment is failed allready.

So, first, I guess the correct sequence with mysql events is only:
[Note] /usr/sbin/mysqld: ready for connections -> /Stage[main]/Galera/Exec[wait-for-synced-state]
but not vice versa as we can see here.

Second, we should fix wait for keystone haproxy backend https://github.com/stackforge/fuel-library/blob/master/deployment/puppet/osnailyfacter/manifests/cluster_ha.pp#L545-L571

Third, close the related bug https://bugs.launchpad.net/fuel/+bug/1393772

Revision history for this message
Stanislav Makar (smakar) wrote :

It's a very interesting bug due to it's floating bug:
One time I reproduced it, next two time on the same env(just made env reset), I haven't caught it again.
First time - I have investigated and found that the problem is with keystone, which doesn't give tokens:
Precisely at moment keystone works with memcached, but memcached works fine
I tried to debug and found that the problem at /usr/lib/python2.6/site-packages/keystone/common/kvs/core.py :
line 379 - self.mutex.acquire() , function def acquire(self):

{"build_id": "2014-11-16_22-00-23", "ostf_sha": "82465a94eed4eff1fc8d8e1f2fb7e9993c22f068", "build_number": "110", "auth_required": true, "api": "1.0", "nailgun_sha": "3e98ea8a042b43be8c00b9f852e19b4c0b853404", "production": "docker", "fuelmain_sha": "137d944d4a658e0358349db0daa2b9bc21ae8218", "astute_sha": "8e05026cf17aeb92e2fb028b3b95ba34803508ed", "feature_groups": ["mirantis"], "release": "6.0", "release_versions": {"2014.2-6.0": {"VERSION": {"build_id": "2014-11-16_22-00-23", "ostf_sha": "82465a94eed4eff1fc8d8e1f2fb7e9993c22f068", "build_number": "110", "api": "1.0", "nailgun_sha": "3e98ea8a042b43be8c00b9f852e19b4c0b853404", "production": "docker", "fuelmain_sha": "137d944d4a658e0358349db0daa2b9bc21ae8218", "astute_sha": "8e05026cf17aeb92e2fb028b3b95ba34803508ed", "feature_groups": ["mirantis"], "release": "6.0", "fuellib_sha": "4c080377d5d1f4397a257e26c836055889fa3138"}}}, "fuellib_sha": "4c080377d5d1f4397a257e26c836055889fa3138"}

Revision history for this message
Bogdan Dobrelya (bogdando) wrote :
Revision history for this message
Bogdan Dobrelya (bogdando) wrote :
Revision history for this message
Bogdan Dobrelya (bogdando) wrote :

@Stanislav, yes, both related bugs are floating due to puppet catalog run deterministic specifics. Some time it puts resources in the way we do not expect, and this bug shows just one more such a case

Revision history for this message
Stanislav Makar (smakar) wrote :

@Bogdan I have contacted @Alexander Makarov
I have the snapshot with affected environment and we applied the patch https://review.fuel-infra.org/#/c/832
and it fixes the bug.
So it isn't related to bugs, you mentioned.
This patch is merged now and is already presented in new packages( I've checked).

Changed in fuel:
assignee: Bogdan Dobrelya (bogdando) → Stanislav Makar (smakar)
Revision history for this message
Vladimir Kuklin (vkuklin) wrote :

This bug is related to keystone-memcached issues, which were fixed. Marking as invalid

Changed in fuel:
status: In Progress → Invalid
To post a comment you must log in.
This report contains Public information  
Everyone can see this information.

Other bug subscribers

Remote bug watches

Bug watches keep track of this bug in other bug trackers.